diff mbox series

[FFmpeg-devel] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_w_pixels, qpel_uni_w_h, qpel_uni_w_v, qpel_uni_w_hv and qpel_h

Message ID 530864e2-a55b-603e-00d4-f6876d391d9e@myais.com.cn
State New
Headers show
Series [FFmpeg-devel] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_w_pixels, qpel_uni_w_h, qpel_uni_w_v, qpel_uni_w_hv and qpel_h | expand

Checks

Context Check Description
andriy/configure_x86 warning Failed to apply patch

Commit Message

Logan.Lyu April 30, 2023, 8:57 a.m. UTC
Hi,
This patch adds aarch64 NEON implementations of the 8-bit
hevc_pel_uni_w_pixels, qpel_uni_w_h, qpel_uni_w_v, qpel_uni_w_hv
and qpel_h interfaces.

put_hevc_pel_uni_w_pixels4_8_c: 54.3
put_hevc_pel_uni_w_pixels4_8_neon: 24.1
put_hevc_pel_uni_w_pixels6_8_c: 105.3
put_hevc_pel_uni_w_pixels6_8_neon: 53.1
put_hevc_pel_uni_w_pixels8_8_c: 176.6
put_hevc_pel_uni_w_pixels8_8_neon: 63.8
put_hevc_pel_uni_w_pixels12_8_c: 391.1
put_hevc_pel_uni_w_pixels12_8_neon: 193.3
put_hevc_pel_uni_w_pixels16_8_c: 688.1
put_hevc_pel_uni_w_pixels16_8_neon: 226.1
put_hevc_pel_uni_w_pixels24_8_c: 1542.3
put_hevc_pel_uni_w_pixels24_8_neon: 536.8
put_hevc_pel_uni_w_pixels32_8_c: 2753.1
put_hevc_pel_uni_w_pixels32_8_neon: 875.8
put_hevc_pel_uni_w_pixels48_8_c: 6251.1
put_hevc_pel_uni_w_pixels48_8_neon: 1966.1
put_hevc_pel_uni_w_pixels64_8_c: 11047.1
put_hevc_pel_uni_w_pixels64_8_neon: 3449.8

put_hevc_qpel_uni_w_h4_8_c: 156.6
put_hevc_qpel_uni_w_h4_8_neon: 44.6
put_hevc_qpel_uni_w_h6_8_c: 324.6
put_hevc_qpel_uni_w_h6_8_neon: 103.1
put_hevc_qpel_uni_w_h8_8_c: 549.3
put_hevc_qpel_uni_w_h8_8_neon: 138.6
put_hevc_qpel_uni_w_h12_8_c: 1240.3
put_hevc_qpel_uni_w_h12_8_neon: 277.3
put_hevc_qpel_uni_w_h16_8_c: 2161.8
put_hevc_qpel_uni_w_h16_8_neon: 394.1
put_hevc_qpel_uni_w_h24_8_c: 4874.8
put_hevc_qpel_uni_w_h24_8_neon: 972.6
put_hevc_qpel_uni_w_h32_8_c: 8517.8
put_hevc_qpel_uni_w_h32_8_neon: 1517.3
put_hevc_qpel_uni_w_h48_8_c: 19856.1
put_hevc_qpel_uni_w_h48_8_neon: 3429.8
put_hevc_qpel_uni_w_h64_8_c: 35159.3
put_hevc_qpel_uni_w_h64_8_neon: 6018.1

put_hevc_qpel_uni_w_v4_8_c: 180.6
put_hevc_qpel_uni_w_v4_8_neon: 63.8
put_hevc_qpel_uni_w_v6_8_c: 318.6
put_hevc_qpel_uni_w_v6_8_neon: 117.8
put_hevc_qpel_uni_w_v8_8_c: 547.6
put_hevc_qpel_uni_w_v8_8_neon: 132.1
put_hevc_qpel_uni_w_v12_8_c: 1202.8
put_hevc_qpel_uni_w_v12_8_neon: 350.1
put_hevc_qpel_uni_w_v16_8_c: 2109.6
put_hevc_qpel_uni_w_v16_8_neon: 442.1
put_hevc_qpel_uni_w_v24_8_c: 4748.8
put_hevc_qpel_uni_w_v24_8_neon: 1287.1
put_hevc_qpel_uni_w_v32_8_c: 8487.3
put_hevc_qpel_uni_w_v32_8_neon: 1704.3
put_hevc_qpel_uni_w_v48_8_c: 18798.8
put_hevc_qpel_uni_w_v48_8_neon: 3790.8
put_hevc_qpel_uni_w_v64_8_c: 35614.6
put_hevc_qpel_uni_w_v64_8_neon: 6725.6


put_hevc_qpel_uni_w_hv4_8_c: 498.8
put_hevc_qpel_uni_w_hv4_8_neon: 139.3
put_hevc_qpel_uni_w_hv6_8_c: 874.6
put_hevc_qpel_uni_w_hv6_8_neon: 295.3
put_hevc_qpel_uni_w_hv8_8_c: 1372.1
put_hevc_qpel_uni_w_hv8_8_neon: 387.1
put_hevc_qpel_uni_w_hv12_8_c: 2721.8
put_hevc_qpel_uni_w_hv12_8_neon: 804.8
put_hevc_qpel_uni_w_hv16_8_c: 4503.1
put_hevc_qpel_uni_w_hv16_8_neon: 1038.1
put_hevc_qpel_uni_w_hv24_8_c: 9321.8
put_hevc_qpel_uni_w_hv24_8_neon: 2962.1
put_hevc_qpel_uni_w_hv32_8_c: 15926.8
put_hevc_qpel_uni_w_hv32_8_neon: 3858.6
put_hevc_qpel_uni_w_hv48_8_c: 35051.1
put_hevc_qpel_uni_w_hv48_8_neon: 9301.1
put_hevc_qpel_uni_w_hv64_8_c: 61215.3
put_hevc_qpel_uni_w_hv64_8_neon: 14920.1

put_hevc_qpel_uni_h4_8_c: 143.3
put_hevc_qpel_uni_h4_8_neon: 55.3
put_hevc_qpel_uni_h6_8_c: 304.6
put_hevc_qpel_uni_h6_8_neon: 82.3
put_hevc_qpel_uni_h8_8_c: 557.8
put_hevc_qpel_uni_h8_8_neon: 99.3
put_hevc_qpel_uni_h12_8_c: 1228.3
put_hevc_qpel_uni_h12_8_neon: 251.6
put_hevc_qpel_uni_h16_8_c: 2210.3
put_hevc_qpel_uni_h16_8_neon: 324.6
put_hevc_qpel_uni_h24_8_c: 4859.1
put_hevc_qpel_uni_h24_8_neon: 962.3
put_hevc_qpel_uni_h32_8_c: 8728.6
put_hevc_qpel_uni_h32_8_neon: 1249.6
put_hevc_qpel_uni_h48_8_c: 20346.3
put_hevc_qpel_uni_h48_8_neon: 2824.1
put_hevc_qpel_uni_h64_8_c: 36702.6
put_hevc_qpel_uni_h64_8_neon: 5012.1




Signed-off-by: myais <Logan.Lyu@myais.com.cn>
---
  libavcodec/aarch64/hevcdsp_init_aarch64.c |   96 +
  libavcodec/aarch64/hevcdsp_qpel_neon.S    | 2223 +++++++++++++++++++++
  2 files changed, 2319 insertions(+)

  put_hevc qpel
  put_hevc qpel_uni
  put_hevc qpel_bi
+
+
+// pel_uni_w_pixels4: unidirectional weighted prediction copy, width 4, 8-bit.
+// dst = clip_u8(((src << 6) * wx >> (denom + 6)) + ox), two rows per pass.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+// w4 = height (even), w5 = denom, w6 = wx, w7 = ox
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6): right shift via sqrshl
+        dup     v30.8h, w6              // wx (only the low .4h lanes are used)
+        dup     v31.4s, w10             // shift
+        dup     v29.4s, w7              // ox in 32-bit lanes (was wrongly .8h: sqadd below reads .4s)
+1:
+        ldr     s0, [x2]
+        ldr     s1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v0.8h, v0.8b, #6        // src << 6
+        ushll   v1.8h, v1.8b, #6
+        smull   v0.4s, v0.4h, v30.4h    // * wx, widened to 32 bit
+        smull   v1.4s, v1.4h, v30.4h
+        sqrshl  v0.4s, v0.4s, v31.4s    // rounding >> (denom + 6)
+        sqrshl  v1.4s, v1.4s, v31.4s
+        sqadd   v0.4s, v0.4s, v29.4s    // + ox, saturating
+        sqadd   v1.4s, v1.4s, v29.4s
+        sqxtn   v0.4h, v0.4s
+        sqxtn   v1.4h, v1.4s
+        sqxtun  v0.8b, v0.8h            // clip to u8
+        sqxtun  v1.8b, v1.8h
+        str     s0, [x0]
+        str     s1, [x0, x1]
+        add     x0, x0, x1, lsl 1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+// pel_uni_w_pixels6: unidirectional weighted prediction copy, width 6, 8-bit.
+// Same math as pixels4; stores each row as 4 bytes + 2 bytes.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+// w4 = height (even), w5 = denom, w6 = wx, w7 = ox
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // shift
+        dup     v29.4s, w7              // ox
+        sub     x1, x1, #4              // str below post-increments dst by 4
+1:
+        ldr     d0, [x2]
+        ldr     d1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v0.8h, v0.8b, #6
+        ushll   v1.8h, v1.8b, #6
+        smull   v4.4s, v0.4h, v30.4h
+        smull2  v5.4s, v0.8h, v30.8h
+        smull   v6.4s, v1.4h, v30.4h
+        smull2  v7.4s, v1.8h, v30.8h
+        sqrshl  v4.4s, v4.4s, v31.4s
+        sqrshl  v5.4s, v5.4s, v31.4s
+        sqrshl  v6.4s, v6.4s, v31.4s
+        sqrshl  v7.4s, v7.4s, v31.4s
+        sqadd   v4.4s, v4.4s, v29.4s
+        sqadd   v5.4s, v5.4s, v29.4s
+        sqadd   v6.4s, v6.4s, v29.4s
+        sqadd   v7.4s, v7.4s, v29.4s
+        sqxtn   v0.4h, v4.4s
+        sqxtn2  v0.8h, v5.4s
+        sqxtn   v1.4h, v6.4s
+        sqxtn2  v1.8h, v7.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     s0, [x0], #4            // first 4 of 6 pixels
+        st1     {v0.h}[2], [x0], x1     // last 2 pixels
+        str     s1, [x0], #4
+        st1     {v1.h}[2], [x0], x1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+// pel_uni_w_pixels8: unidirectional weighted prediction copy, width 8, 8-bit.
+// dst = clip_u8(((src << 6) * wx >> (denom + 6)) + ox), two rows per pass.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+// w4 = height (even), w5 = denom, w6 = wx, w7 = ox
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6): right shift via sqrshl
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // shift
+        dup     v29.4s, w7              // ox
+1:
+        ldr     d0, [x2]
+        ldr     d1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v0.8h, v0.8b, #6        // src << 6
+        ushll   v1.8h, v1.8b, #6
+        smull   v4.4s, v0.4h, v30.4h    // * wx, widened to 32 bit
+        smull2  v5.4s, v0.8h, v30.8h
+        smull   v6.4s, v1.4h, v30.4h
+        smull2  v7.4s, v1.8h, v30.8h
+        sqrshl  v4.4s, v4.4s, v31.4s    // rounding >> (denom + 6)
+        sqrshl  v5.4s, v5.4s, v31.4s
+        sqrshl  v6.4s, v6.4s, v31.4s
+        sqrshl  v7.4s, v7.4s, v31.4s
+        sqadd   v4.4s, v4.4s, v29.4s    // + ox, saturating
+        sqadd   v5.4s, v5.4s, v29.4s
+        sqadd   v6.4s, v6.4s, v29.4s
+        sqadd   v7.4s, v7.4s, v29.4s
+        sqxtn   v0.4h, v4.4s
+        sqxtn2  v0.8h, v5.4s
+        sqxtn   v1.4h, v6.4s
+        sqxtn2  v1.8h, v7.4s
+        sqxtun  v0.8b, v0.8h            // clip to u8
+        sqxtun  v1.8b, v1.8h
+        str     d0, [x0]
+        str     d1, [x0, x1]
+        add     x0, x0, x1, lsl 1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+// pel_uni_w_pixels12: unidirectional weighted prediction copy, width 12, 8-bit.
+// Same math as pixels8; stores each row as 8 bytes + 4 bytes.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+// w4 = height (even), w5 = denom, w6 = wx, w7 = ox
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // shift
+        dup     v29.4s, w7              // ox
+        sub     x1, x1, #8              // str below post-increments dst by 8
+1:
+        ldr     q0, [x2]                // 16 bytes loaded, 12 used
+        ldr     q1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v4.8h, v0.8b, #6
+        ushll2  v5.8h, v0.16b, #6
+        ushll   v6.8h, v1.8b, #6
+        ushll2  v7.8h, v1.16b, #6
+        smull   v16.4s, v4.4h, v30.4h
+        smull2  v17.4s, v4.8h, v30.8h
+        smull   v18.4s, v5.4h, v30.4h
+        smull2  v19.4s, v5.8h, v30.8h
+        smull   v20.4s, v6.4h, v30.4h
+        smull2  v21.4s, v6.8h, v30.8h
+        smull   v22.4s, v7.4h, v30.4h
+        smull2  v23.4s, v7.8h, v30.8h
+
+        sqrshl  v16.4s, v16.4s, v31.4s
+        sqrshl  v17.4s, v17.4s, v31.4s
+        sqrshl  v18.4s, v18.4s, v31.4s
+        sqrshl  v19.4s, v19.4s, v31.4s
+        sqrshl  v20.4s, v20.4s, v31.4s
+        sqrshl  v21.4s, v21.4s, v31.4s
+        sqrshl  v22.4s, v22.4s, v31.4s
+        sqrshl  v23.4s, v23.4s, v31.4s
+        sqadd   v16.4s, v16.4s, v29.4s
+        sqadd   v17.4s, v17.4s, v29.4s
+        sqadd   v18.4s, v18.4s, v29.4s
+        sqadd   v19.4s, v19.4s, v29.4s
+        sqadd   v20.4s, v20.4s, v29.4s
+        sqadd   v21.4s, v21.4s, v29.4s
+        sqadd   v22.4s, v22.4s, v29.4s
+        sqadd   v23.4s, v23.4s, v29.4s
+        sqxtn   v0.4h, v16.4s
+        sqxtn2  v0.8h, v17.4s
+        sqxtn   v1.4h, v18.4s
+        sqxtn2  v1.8h, v19.4s
+        sqxtn   v2.4h, v20.4s
+        sqxtn2  v2.8h, v21.4s
+        sqxtn   v3.4h, v22.4s
+        sqxtn2  v3.8h, v23.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun2 v0.16b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        sqxtun2 v2.16b, v3.8h
+        str     d0, [x0], #8            // first 8 of 12 pixels
+        st1     {v0.s}[2], [x0], x1     // last 4 pixels
+        str     d2, [x0], #8
+        st1     {v2.s}[2], [x0], x1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+// Weight the 16 pixels in \s0 in place:
+//   u8 -> u16 (<<6), * wx (v30), saturating rounding shift by v31
+//   (= -(denom + 6)), + ox (v29), then saturate-narrow back to 16 u8 in \s0.
+// \t0/\t1 are 8h scratch, \d0-\d3 are 4s scratch; v29/v30/v31 are
+// set up once by the calling function.
+.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3
+        ushll   \t0\().8h, \s0\().8b, #6
+        ushll2  \t1\().8h, \s0\().16b, #6
+        smull   \d0\().4s, \t0\().4h, v30.4h
+        smull2  \d1\().4s, \t0\().8h, v30.8h
+        smull   \d2\().4s, \t1\().4h, v30.4h
+        smull2  \d3\().4s, \t1\().8h, v30.8h
+        sqrshl  \d0\().4s, \d0\().4s, v31.4s
+        sqrshl  \d1\().4s, \d1\().4s, v31.4s
+        sqrshl  \d2\().4s, \d2\().4s, v31.4s
+        sqrshl  \d3\().4s, \d3\().4s, v31.4s
+        sqadd   \d0\().4s, \d0\().4s, v29.4s
+        sqadd   \d1\().4s, \d1\().4s, v29.4s
+        sqadd   \d2\().4s, \d2\().4s, v29.4s
+        sqadd   \d3\().4s, \d3\().4s, v29.4s
+        sqxtn   \t0\().4h, \d0\().4s
+        sqxtn2  \t0\().8h, \d1\().4s
+        sqxtn   \t1\().4h, \d2\().4s
+        sqxtn2  \t1\().8h, \d3\().4s
+        sqxtun  \s0\().8b,  \t0\().8h
+        sqxtun2 \s0\().16b, \t1\().8h
+.endm
+
+
+// pel_uni_w_pixels16: unidirectional weighted prediction copy, width 16,
+// 8-bit, two rows per pass via PEL_UNI_W_PIXEL_CALC.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+// w4 = height (even), w5 = denom, w6 = wx, w7 = ox
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // shift
+        dup     v29.4s, w7              // ox
+1:
+        ldr     q0, [x2]
+        ldr     q1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        str     q0, [x0]
+        str     q1, [x0, x1]
+        add     x0, x0, x1, lsl 1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+
+
+// pel_uni_w_pixels24: unidirectional weighted prediction copy, width 24,
+// 8-bit, one row per pass (16 pixels from v0 + low 8 from v1; v1's high
+// half is loaded but unused).
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+// w4 = height, w5 = denom, w6 = wx, w7 = ox
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // shift
+        dup     v29.4s, w7              // ox
+1:
+        ld1     {v0.16b, v1.16b}, [x2], x3
+        ushll   v4.8h, v0.8b, #6        // src << 6
+        ushll2  v5.8h, v0.16b, #6
+        ushll   v6.8h, v1.8b, #6
+        smull   v16.4s, v4.4h, v30.4h   // * wx
+        smull2  v17.4s, v4.8h, v30.8h
+        smull   v18.4s, v5.4h, v30.4h
+        smull2  v19.4s, v5.8h, v30.8h
+        smull   v20.4s, v6.4h, v30.4h
+        smull2  v21.4s, v6.8h, v30.8h
+        sqrshl  v16.4s, v16.4s, v31.4s  // rounding >> (denom + 6)
+        sqrshl  v17.4s, v17.4s, v31.4s
+        sqrshl  v18.4s, v18.4s, v31.4s
+        sqrshl  v19.4s, v19.4s, v31.4s
+        sqrshl  v20.4s, v20.4s, v31.4s
+        sqrshl  v21.4s, v21.4s, v31.4s
+        sqadd   v16.4s, v16.4s, v29.4s  // + ox, saturating
+        sqadd   v17.4s, v17.4s, v29.4s
+        sqadd   v18.4s, v18.4s, v29.4s
+        sqadd   v19.4s, v19.4s, v29.4s
+        sqadd   v20.4s, v20.4s, v29.4s
+        sqadd   v21.4s, v21.4s, v29.4s
+        sqxtn   v0.4h, v16.4s
+        sqxtn2  v0.8h, v17.4s
+        sqxtn   v1.4h, v18.4s
+        sqxtn2  v1.8h, v19.4s
+        sqxtn   v2.4h, v20.4s
+        sqxtn2  v2.8h, v21.4s
+        sqxtun  v0.8b, v0.8h            // clip to u8
+        sqxtun  v1.8b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        st1     {v0.8b, v1.8b, v2.8b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+// pel_uni_w_pixels32: unidirectional weighted prediction copy, width 32,
+// 8-bit, one row per pass via PEL_UNI_W_PIXEL_CALC.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+// w4 = height, w5 = denom, w6 = wx, w7 = ox
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // shift
+        dup     v29.4s, w7              // ox
+1:
+        ld1     {v0.16b, v1.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        st1     {v0.16b, v1.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+
+// pel_uni_w_pixels48: unidirectional weighted prediction copy, width 48,
+// 8-bit, one row per pass. Scratch registers are reused across the three
+// macro invocations (each leaves its result in the source register).
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+// w4 = height, w5 = denom, w6 = wx, w7 = ox
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // shift
+        dup     v29.4s, w7              // ox
+1:
+        ld1     {v0.16b, v1.16b, v2.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        st1     {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+// pel_uni_w_pixels64: unidirectional weighted prediction copy, width 64,
+// 8-bit, one row per pass.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+// w4 = height, w5 = denom, w6 = wx, w7 = ox
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // shift
+        dup     v29.4s, w7              // ox
+1:
+        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+// Common prologue for the vertical uni_w functions: rewind src by 3 rows,
+// broadcast the 8 filter-tap magnitudes selected by `my` (8 bytes from the
+// qpel_filters_abs table) into v0-v7, and set up the weighting constants
+// v29/v30/v31 used by the QPEL_UNI_W_V_* macros.
+.macro QPEL_UNI_W_V_HEADER
+        ldur            x12, [sp, #8]          // my
+        sub             x2, x2, x3, lsl #1     // x2 -= 3 * srcstride
+        sub             x2, x2, x3
+        movrel          x9, qpel_filters_abs
+        add             x9, x9, x12, lsl 3     // 8 bytes of taps per filter index
+        ldr             d28, [x9]
+        dup             v0.16b, v28.b[0]       // per-tap broadcast for umull/umlal
+        dup             v1.16b, v28.b[1]
+        dup             v2.16b, v28.b[2]
+        dup             v3.16b, v28.b[3]
+        dup             v4.16b, v28.b[4]
+        dup             v5.16b, v28.b[5]
+        dup             v6.16b, v28.b[6]
+        dup             v7.16b, v28.b[7]
+
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
+
+// 8-tap vertical filter over the low 8 bytes of \src0..\src7 -> \dst (.8h).
+// Tap magnitudes come broadcast in v0-v7 (set up from qpel_filters_abs);
+// the sign pattern (-, +, -, +, +, -, +, -) is hard-coded via umlsl/umlal.
+// NOTE(review): this assumes the fixed sign pattern holds for every entry
+// of qpel_filters_abs — confirm against that table's definition.
+.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+// Same 8-tap filter as QPEL_FILTER_B, but over the HIGH 8 bytes of the
+// 16-byte sources (umull2/umlal2/umlsl2 variants) -> \dst (.8h).
+.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
+
+// Weight/shift/offset the 4 filtered samples in v24.4h and store 4 bytes
+// to dst, advancing x0 by dststride. Uses v29/v30/v31 from the header.
+.macro  QPEL_UNI_W_V_4
+        smull           v24.4s, v24.4h, v30.4h  // * wx
+        sqrshl          v24.4s, v24.4s, v31.4s  // rounding >> (denom + 6)
+        sqadd           v24.4s, v24.4s, v29.4s  // + ox
+        sqxtn           v24.4h, v24.4s
+        sqxtun          v24.8b, v24.8h          // clip to u8
+        st1             {v24.s}[0], [x0], x1
+.endm
+
+// qpel_uni_w_v4: vertical 8-tap qpel filter + weighted prediction, width 4.
+// The prologue loads the first 7 rows into s16-s22; the loop is unrolled
+// 8x so the sliding 8-row window rotates through v16-v23 without any
+// register moves. One output row per unrolled step; w4 = height.
+function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             s16, [x2]
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             s18, [x2]
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             s20, [x2]
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             s22, [x2]
+
+1:      ldr             s23, [x2, x3]           // newest row enters the window
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s16, [x2]
+        QPEL_FILTER_B     v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s18, [x2]
+        QPEL_FILTER_B     v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s20, [x2]
+        QPEL_FILTER_B     v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s22, [x2]
+        QPEL_FILTER_B     v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// Weight/shift/offset the 8 filtered samples in v26.8h and store 8 bytes
+// to dst, advancing x0 by dststride. Clobbers v24/v25.
+.macro QPEL_UNI_W_V_8
+        smull           v24.4s, v26.4h, v30.4h  // * wx
+        smull2          v25.4s, v26.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s  // rounding >> (denom + 6)
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s  // + ox
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h          // clip to u8
+        st1             {v24.d}[0], [x0], x1
+.endm
+
+// qpel_uni_w_v8: vertical 8-tap qpel filter + weighted prediction, width 8.
+// Same structure as the width-4 version: 7 rows preloaded into d16-d22,
+// 8x-unrolled loop rotating the row window through v16-v23. w4 = height.
+function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             d16, [x2]
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             d22, [x2]
+
+1:      ldr             d23, [x2, x3]           // newest row enters the window
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// Weight/shift/offset the 16 filtered samples in v26.8h/v27.8h and store
+// 16 bytes to dst, advancing x0 by dststride. Clobbers v24-v27 (v26/v27
+// are consumed in place).
+.macro QPEL_UNI_W_V_16
+        smull           v24.4s, v26.4h, v30.4h  // * wx
+        smull2          v25.4s, v26.8h, v30.8h
+        smull           v26.4s, v27.4h, v30.4h
+        smull2          v27.4s, v27.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s  // rounding >> (denom + 6)
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqrshl          v26.4s, v26.4s, v31.4s
+        sqrshl          v27.4s, v27.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s  // + ox
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h          // clip to u8
+        sqxtun2         v24.16b, v26.8h
+        st1             {v24.16b}, [x0], x1
+.endm
+
+// qpel_uni_w_v16: vertical 8-tap qpel filter + weighted prediction,
+// width 16. 7 rows preloaded into q16-q22; 8x-unrolled loop rotates the
+// row window through v16-v23. QPEL_FILTER_B/B2 produce the low/high 8
+// columns respectively. w4 = height.
+function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q22, [x2]
+
+1:      ldr             q23, [x2, x3]           // newest row enters the window
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// qpel_uni_w_v64: vertical 8-tap qpel filter + weighted prediction for
+// wide blocks. The outer loop (3:) walks the width in 16-column strips:
+// w13 = width (stack arg at [sp, #16]), x14/x15 track the per-strip
+// dst/src base pointers, w11 restores the height for each strip.
+// NOTE(review): the strip loop handles any width that is a multiple of 16;
+// confirm which widths the init code binds to this entry point.
+// The inner loop is the same 8x-unrolled row rotation as the v16 version.
+function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldur            w13, [sp, #16]          // width
+        mov             x14, x0                 // strip dst base
+        mov             x15, x2                 // strip src base
+        mov             w11, w4                 // saved height
+
+3:
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q22, [x2]
+
+
+1:      ldr             q23, [x2, x3]           // newest row enters the window
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        subs            w13, w13, #16           // next 16-column strip
+        add             x14, x14, #16
+        add             x15, x15, #16
+        mov             x0, x14
+        mov             x2, x15
+        mov             w4, w11                 // reset height
+        b.hi            3b
+        ret
+endfunc
+
+// NOTE(review): usdot below is part of FEAT_I8MM (__ARM_FEATURE_MATMUL_INT8),
+// not FEAT_DOTPROD — this compile-time guard looks too weak, and FFmpeg
+// normally uses runtime CPU-flag dispatch rather than #if. Confirm.
+#if __ARM_FEATURE_DOTPROD
+// Common prologue for the horizontal uni_w functions: x12 = mx (stack arg),
+// src -= 3 columns, v28 = the 8 signed filter taps for mx replicated into
+// both 8-byte halves (so one usdot covers two 4-tap partial sums), and
+// v29/v30/v31 = ox / wx / -(denom + 6) shift broadcast to 4s lanes.
+.macro QPEL_UNI_W_H_HEADER
+        ldr             x12, [sp]               // mx
+        sub             x2, x2, #3              // leftmost tap offset
+        movrel          x9, qpel_filters
+        add             x9, x9, x12, lsl 3      // 8 bytes of taps per filter index
+        ldr             x11, [x9]
+        dup             v28.2d, x11             // taps duplicated to both halves
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.4s, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
+
+// qpel_uni_w_h4: horizontal 8-tap qpel filter + weighted prediction,
+// width 4, one row per pass. Four overlapping 8-byte windows are built
+// with ext and packed pairwise (zip1) so each usdot produces two 4-byte
+// partial dot products per output pixel; addp folds them into the four
+// 8-tap sums. w4 = height.
+function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v0.16b}, [x2], x3
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        zip1            v0.2d, v0.2d, v1.2d     // windows 0,1 in one vector
+        zip1            v2.2d, v2.2d, v3.2d     // windows 2,3
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        usdot           v16.4s, v0.16b, v28.16b
+        usdot           v17.4s, v2.16b, v28.16b
+        addp            v16.4s, v16.4s, v17.4s  // fold pairs -> 4 filter sums
+        mul             v16.4s, v16.4s, v30.4s  // * wx
+        sqrshl          v16.4s, v16.4s, v31.4s  // rounding >> (denom + 6)
+        sqadd           v16.4s, v16.4s, v29.4s  // + ox
+        sqxtn           v16.4h, v16.4s
+        sqxtun          v16.8b, v16.8h          // clip to u8
+        str             s16, [x0]
+        add             x0, x0, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+// qpel_uni_w_h6: horizontal 8-tap qpel filter + weighted prediction,
+// width 6, one row per pass. Six overlapping windows; pixels 4-5 are
+// processed as a .2s pair (the .4s reads of v18 below only consume the
+// two computed lanes via sqxtn2/sqxtun lanes 4-5). Row stored as 4 + 2
+// bytes. w4 = height.
+function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+        sub             x1, x1, #4              // str below post-increments dst by 4
+1:
+        ld1             {v0.16b}, [x2], x3
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        ext             v4.16b, v0.16b, v0.16b, #4
+        ext             v5.16b, v0.16b, v0.16b, #5
+        zip1            v0.2d, v0.2d, v1.2d     // windows 0,1
+        zip1            v2.2d, v2.2d, v3.2d     // windows 2,3
+        zip1            v4.2d, v4.2d, v5.2d     // windows 4,5
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v0.16b, v28.16b
+        usdot           v17.4s, v2.16b, v28.16b
+        usdot           v18.4s, v4.16b, v28.16b
+        addp            v16.4s, v16.4s, v17.4s  // filter sums 0-3
+        addp            v18.4s, v18.4s, v18.4s  // filter sums 4-5 in low .2s
+        mul             v16.4s, v16.4s, v30.4s  // * wx
+        mul             v18.2s, v18.2s, v30.2s
+        sqrshl          v16.4s, v16.4s, v31.4s  // rounding >> (denom + 6)
+        sqrshl          v18.2s, v18.2s, v31.2s
+        sqadd           v16.4s, v16.4s, v29.4s  // + ox
+        sqadd           v18.2s, v18.2s, v29.2s
+        sqxtn           v16.4h, v16.4s
+        sqxtn2          v16.8h, v18.4s
+        sqxtun          v16.8b, v16.8h          // clip to u8
+        str             s16, [x0], #4           // first 4 of 6 pixels
+        st1             {v16.h}[2], [x0], x1    // last 2 pixels
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+// Dot-product four packed window vectors \s0..\s3 against the duplicated
+// taps in v28, fold pairs with addp, then apply wx (v30), the rounding
+// shift (v31) and ox (v29). Results land in \d0 and \d2 (4 x s32 each);
+// \d1/\d3 are scratch accumulators.
+.macro  QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        movi            \d2\().2d, #0
+        movi            \d3\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        usdot           \d2\().4s, \s2\().16b, v28.16b
+        usdot           \d3\().4s, \s3\().16b, v28.16b
+        addp            \d0\().4s, \d0\().4s, \d1\().4s
+        addp            \d2\().4s, \d2\().4s, \d3\().4s
+        mul             \d0\().4s, \d0\().4s, v30.4s    // * wx
+        mul             \d2\().4s, \d2\().4s, v30.4s
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s    // rounding >> (denom + 6)
+        sqrshl          \d2\().4s, \d2\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s    // + ox
+        sqadd           \d2\().4s, \d2\().4s, v29.4s
+.endm
+
+// Half-width variant of QPEL_UNI_W_H_CALC: filters and weights just two
+// operands; \d0 receives four 32-bit weighted sums, \d1 is scratch.
+.macro  QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        addp            \d0\().4s, \d0\().4s, \d1\().4s
+        mul             \d0\().4s, \d0\().4s, v30.4s
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s
+.endm
+
+
+// Weighted uni-prediction, horizontal qpel, width 8, 8-bit.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height.
+function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        // Eight shifted windows; zip adjacent shifts so each usdot operand
+        // holds two output pixels' 8-tap windows.
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        zip1            v0.2d, v16.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_UNI_W_H_CALC  v0, v2, v4, v6,  v18, v19, v20, v21
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v20.4s
+        sqxtun          v18.8b, v18.8h
+        str             d18, [x0]
+        add             x0, x0, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+// Weighted uni-prediction, horizontal qpel, width 12, 8-bit.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height.
+// x13 is a second store pointer for the trailing 4 pixels (dst + 8).
+function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+        add             x13, x0, #8
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        // zip1 pairs cover pixels 0..7; zip2 pairs cover pixels 8..11.
+        zip1            v18.2d, v16.2d, v1.2d
+        zip1            v19.2d, v2.2d, v3.2d
+        zip1            v20.2d, v4.2d, v5.2d
+        zip1            v21.2d, v6.2d, v7.2d
+        zip2            v22.2d, v16.2d, v1.2d
+        zip2            v23.2d, v2.2d, v3.2d
+        QPEL_UNI_W_H_CALC  v18, v19, v20, v21, v0, v2, v4, v6
+        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v4.4s
+        sqxtn           v1.4h, v24.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+
+        str             d0, [x0]                // pixels 0..7
+        str             s1, [x13]               // pixels 8..11
+        add             x0, x0, x1
+        add             x13, x13, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+// Weighted uni-prediction, horizontal qpel, width 16, 8-bit.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height.
+// Here the operands are NOT pre-zipped; the lane order produced by the
+// addp inside QPEL_UNI_W_H_CALC is interleaved (even/odd pixels) and is
+// un-interleaved afterwards with trn1/trn2.
+function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21   // v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25    // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
+        sqxtn           v0.4h, v18.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v1.4h, v20.4s
+        sqxtn2          v1.8h, v24.4s
+        trn1            v2.8h, v0.8h, v1.8h     // even/odd pixels back in order
+        trn2            v3.8h, v0.8h, v1.8h
+        sqxtun          v0.8b, v2.8h
+        sqxtun2         v0.16b, v3.8h
+        st1             {v0.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+// Weighted uni-prediction, horizontal qpel, width 24 (16 + 8), 8-bit.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height.
+function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+        sub             x1, x1, #16             // store is 16 bytes + 8 bytes per row
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        // First 16 pixels: interleaved-lane scheme (see width-16 version).
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v18.8h, v19.8h
+        trn2            v21.8h, v18.8h, v19.8h
+        sqxtun          v26.8b, v20.8h
+        sqxtun2         v26.16b, v21.8h                         // 0-15
+        // Last 8 pixels: zipped-pair scheme on the second source register.
+        ext             v1.16b, v17.16b, v17.16b, #1
+        ext             v2.16b, v17.16b, v17.16b, #2
+        ext             v3.16b, v17.16b, v17.16b, #3
+        ext             v4.16b, v17.16b, v17.16b, #4
+        ext             v5.16b, v17.16b, v17.16b, #5
+        ext             v6.16b, v17.16b, v17.16b, #6
+        ext             v7.16b, v17.16b, v17.16b, #7
+        zip1            v0.2d, v17.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_UNI_W_H_CALC  v0, v2, v4, v6, v18, v19, v20, v21
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v20.4s
+        sqxtun          v27.8b, v18.8h
+
+        st1             {v26.16b}, [x0], #16
+        st1             {v27.8b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+// Weighted uni-prediction, horizontal qpel, width 32 (2 x 16), 8-bit.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height.
+// Two applications of the width-16 interleaved-lane scheme per row.
+function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v0, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v0.8h, v19.8h
+        trn2            v21.8h, v0.8h, v19.8h
+        sqxtun          v26.8b, v20.8h
+        sqxtun2         v26.16b, v21.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v0, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v0.8h, v19.8h
+        trn2            v21.8h, v0.8h, v19.8h
+        sqxtun          v27.8b, v20.8h
+        sqxtun2         v27.16b, v21.8h                         // 16-31
+        st1             {v26.16b, v27.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+// Weighted uni-prediction, horizontal qpel, width 48 (3 x 16), 8-bit.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height.
+// Scratch outputs of QPEL_UNI_W_H_CALC are deliberately aliased onto
+// v24/v0, which are dead between uses.
+function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v25.8b, v22.8h
+        sqxtun2         v25.16b, v23.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v26.8b, v22.8h
+        sqxtun2         v26.16b, v23.8h                         // 16-31
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v27.8b, v22.8h
+        sqxtun2         v27.16b, v23.8h                         // 32-47
+        st1             {v25.16b, v26.16b, v27.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+
+// Weighted uni-prediction, horizontal qpel, width 64 (4 x 16), 8-bit.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height.
+// The first 64 source bytes are loaded with post-increment #64; the
+// extra bytes needed by the last 16 pixels come from a second load, so
+// the stride is pre-decremented by 64 to compensate.
+function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+        sub             x3, x3, #64
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v16.8b, v22.8h
+        sqxtun2         v16.16b, v23.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v17.8b, v22.8h
+        sqxtun2         v17.16b, v23.8h                         // 16-31
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        ld1             {v0.16b}, [x2], x3      // tail bytes for pixels 48-63
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v18.8b, v22.8h
+        sqxtun2         v18.16b, v23.8h                         // 32-47
+        ext             v1.16b, v19.16b, v0.16b, #1
+        ext             v2.16b, v19.16b, v0.16b, #2
+        ext             v3.16b, v19.16b, v0.16b, #3
+        ext             v4.16b, v19.16b, v0.16b, #4
+        ext             v5.16b, v19.16b, v0.16b, #5
+        ext             v6.16b, v19.16b, v0.16b, #6
+        ext             v7.16b, v19.16b, v0.16b, #7
+        QPEL_UNI_W_H_CALC  v19, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v19.8b, v22.8h
+        sqxtun2         v19.16b, v23.8h                         // 48-63
+
+        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+// Common setup for the put_hevc_qpel_hN functions:
+// loads the 8 signed 8-bit taps for phase x4 (mx) and broadcasts the
+// 64-bit tap group into both halves of v31; rewinds src (x1) by 3 so the
+// load window is centered on the filtered pixel.
+.macro QPEL_H_HEADER
+        movrel          x9, qpel_filters
+        add             x9, x9, x4, lsl 3       // x9 = &qpel_filters[mx][0]
+        ldr             x11, [x9]
+        dup             v31.2d, x11
+        sub             x1, x1, #3
+.endm
+
+// Horizontal qpel filter, width 4, 8-bit -> 16-bit intermediate.
+// x0 = dst (int16_t, MAX_PB_SIZE stride), x1 = src, x2 = srcstride,
+// w3 = height, x4 = mx.
+// NOTE(review): usdot requires FEAT_I8MM, not FEAT_DotProd -- confirm the
+// _neon_dotprod naming and CPU-flag gating.
+function ff_hevc_put_hevc_qpel_h4_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+1:
+        ld1             {v0.16b}, [x1], x2
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        zip1            v0.2d, v0.2d, v1.2d     // two zipped 8-tap windows each
+        zip1            v2.2d, v2.2d, v3.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        usdot           v16.4s, v0.16b, v31.16b
+        usdot           v17.4s, v2.16b, v31.16b
+        addp            v16.4s, v16.4s, v17.4s  // four 32-bit pixel sums
+        sqxtn           v16.4h, v16.4s
+        str             d16, [x0]
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Horizontal qpel filter, width 6, 8-bit -> 16-bit intermediate.
+// x0 = dst (int16_t, MAX_PB_SIZE stride), x1 = src, x2 = srcstride,
+// w3 = height, x4 = mx.  x15 stores the trailing 2 pixels at dst + 8 bytes.
+function ff_hevc_put_hevc_qpel_h6_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #8
+1:
+        ld1             {v0.16b}, [x1], x2
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        ext             v4.16b, v0.16b, v0.16b, #4
+        ext             v5.16b, v0.16b, v0.16b, #5
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v0.16b, v31.16b
+        usdot           v17.4s, v2.16b, v31.16b
+        usdot           v18.4s, v4.16b, v31.16b
+        addp            v16.4s, v16.4s, v17.4s  // pixels 0..3
+        addp            v18.4s, v18.4s, v18.4s  // pixels 4..5 (duplicated)
+        sqxtn           v16.4h, v16.4s
+        sqxtn           v18.4h, v18.4s
+        str             d16, [x0]
+        str             s18, [x15]
+        add             x0, x0, x10
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Horizontal qpel filter, width 8, 8-bit -> 16-bit intermediate.
+// x0 = dst (int16_t, MAX_PB_SIZE stride), x1 = src, x2 = srcstride,
+// w3 = height, x4 = mx.
+function ff_hevc_put_hevc_qpel_h8_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+1:
+        ld1             {v0.16b}, [x1], x2
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        ext             v4.16b, v0.16b, v0.16b, #4
+        ext             v5.16b, v0.16b, v0.16b, #5
+        ext             v6.16b, v0.16b, v0.16b, #6
+        ext             v7.16b, v0.16b, v0.16b, #7
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v16.4s, v0.16b, v31.16b
+        usdot           v17.4s, v2.16b, v31.16b
+        usdot           v18.4s, v4.16b, v31.16b
+        usdot           v19.4s, v6.16b, v31.16b
+        addp            v16.4s, v16.4s, v17.4s  // pixels 0..3
+        addp            v18.4s, v18.4s, v19.4s  // pixels 4..7
+        sqxtn           v16.4h, v16.4s
+        sqxtn2          v16.8h, v18.4s
+        str             q16, [x0]
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Run four usdot filters with taps v31 on \s0..\s3 into zeroed
+// accumulators \d0..\d3 (pairwise folding is left to the caller).
+// The original patch line was corrupted by mail-client wrapping (the
+// .macro header and the first movi were fused); restored here.
+.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        movi            \d2\().2d, #0
+        movi            \d3\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v31.16b
+        usdot           \d1\().4s, \s1\().16b, v31.16b
+        usdot           \d2\().4s, \s2\().16b, v31.16b
+        usdot           \d3\().4s, \s3\().16b, v31.16b
+.endm
+
+// Horizontal qpel filter, width 12, 8-bit -> 16-bit intermediate.
+// x0 = dst (int16_t, MAX_PB_SIZE stride), x1 = src, x2 = srcstride,
+// w3 = height, x4 = mx.  x15 stores the trailing 4 pixels at dst + 16 bytes.
+function ff_hevc_put_hevc_qpel_h12_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #16
+1:
+        ld1             {v16.16b, v17.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        // Zipped pairs for the upper four pixels (8..11).
+        zip1            v18.2d, v4.2d, v5.2d
+        zip1            v19.2d, v6.2d, v7.2d
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        addp            v20.4s, v20.4s, v22.4s  // interleaved even pixels
+        addp            v21.4s, v21.4s, v23.4s  // interleaved odd pixels
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        usdot           v24.4s, v18.16b, v31.16b
+        usdot           v25.4s, v19.16b, v31.16b
+        addp            v24.4s, v24.4s, v25.4s  // pixels 8..11
+        trn1            v26.4s, v20.4s, v21.4s  // un-interleave pixels 0..7
+        trn2            v27.4s, v20.4s, v21.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn           v27.4h, v27.4s
+        sqxtn2          v26.8h, v24.4s
+
+        str             q26, [x0]
+        str             d27, [x15]
+        add             x0, x0, x10
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Horizontal qpel filter, width 16, 8-bit -> 16-bit intermediate.
+// x0 = dst (int16_t, MAX_PB_SIZE stride), x1 = src, x2 = srcstride,
+// w3 = height, x4 = mx.
+// The mail-wrapped "+        +        stp" line is restored to the blank
+// separator plus the stp it originally was.
+function ff_hevc_put_hevc_qpel_h16_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+1:
+        ld1             {v16.16b, v17.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+
+        // Un-interleave even/odd pixel sums back into source order.
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+
+        sqxtn           v18.4h, v22.4s
+        sqxtn2          v18.8h, v26.4s
+        sqxtn           v19.4h, v23.4s
+        sqxtn2          v19.8h, v27.4s
+
+        stp             q18, q19, [x0]
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Horizontal qpel filter, width 24 (16 + 8), 8-bit -> 16-bit intermediate.
+// x0 = dst (int16_t, MAX_PB_SIZE stride), x1 = src, x2 = srcstride,
+// w3 = height, x4 = mx.  x15 stores the last 8 pixels at dst + 32 bytes.
+// FIX: the second QPEL_H_CALC of the tail must take the zipped pair
+// register v6 (ext #6/#7), not the raw v5 window, matching the zip1
+// sequence above it and the width-8 function.
+function ff_hevc_put_hevc_qpel_h24_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #32
+1:
+        ld1             {v16.16b, v17.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v18.4h, v22.4s
+        sqxtn2          v18.8h, v26.4s
+        sqxtn           v19.4h, v23.4s
+        sqxtn2          v19.8h, v27.4s
+        stp             q18, q19, [x0]          // pixels 0-15
+        add             x0, x0, x10
+        // Tail: pixels 16-23 from v17 with the zipped-pair scheme.
+        ext             v1.16b, v17.16b, v17.16b, #1
+        ext             v2.16b, v17.16b, v17.16b, #2
+        ext             v3.16b, v17.16b, v17.16b, #3
+        ext             v4.16b, v17.16b, v17.16b, #4
+        ext             v5.16b, v17.16b, v17.16b, #5
+        ext             v6.16b, v17.16b, v17.16b, #6
+        ext             v7.16b, v17.16b, v17.16b, #7
+        zip1            v0.2d, v17.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_H_CALC     v0, v2, v4, v6, v20, v21, v22, v23
+        addp            v20.4s, v20.4s, v21.4s
+        addp            v22.4s, v22.4s, v23.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        str             q20, [x15]              // pixels 16-23
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Horizontal qpel filter, width 32 (2 x 16), 8-bit -> 16-bit intermediate.
+// x0 = dst (int16_t, MAX_PB_SIZE stride), x1 = src, x2 = srcstride,
+// w3 = height, x4 = mx.  x15 writes the second 16 pixels at dst + 32 bytes.
+function ff_hevc_put_hevc_qpel_h32_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #32
+1:
+        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0]          // pixels 0-15
+        add             x0, x0, x10
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x15]         // pixels 16-31
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Horizontal qpel filter, width 48 (3 x 16), 8-bit -> 16-bit intermediate.
+// x0 = dst (int16_t, MAX_PB_SIZE stride), x1 = src, x2 = srcstride,
+// w3 = height, x4 = mx.  The dst pointer advances #32 after the first two
+// stores, so the row stride correction is MAX_PB_SIZE*2 - 64.
+function ff_hevc_put_hevc_qpel_h48_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2 - 64
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32     // pixels 0-15
+
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32     // pixels 16-31
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0]          // pixels 32-47
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// ff_hevc_put_hevc_qpel_h64_8_neon_dotprod: 8-tap horizontal qpel filter for
+// 64-wide rows, 8-bit source to 16-bit intermediate destination.
+//   x0: dst (int16_t; four 32-byte stores = one full 128-byte row per pass)
+//   x1: src (uint8_t)
+//   x2: src stride (reduced by 64 below because ld1 post-increments x1 by 64)
+//   w3: row count
+// QPEL_H_HEADER / QPEL_H_CALC are defined above this hunk; QPEL_H_CALC is
+// assumed to produce dot-product partial sums whose lane order is restored by
+// the addp/trn1/trn2 sequences below -- TODO confirm against their definitions.
+function ff_hevc_put_hevc_qpel_h64_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        sub             x2, x2, #64             // x1 already advanced 64 per row
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
+        // Bytes 0-15: v1..v7 are the 7 shifted source windows, one per extra
+        // filter tap position.
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        // Pairwise-add + transpose interleave the partial sums back into
+        // ascending lane order, then saturating-narrow 32->16 bit.
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32     // 16 int16 results
+
+        // Bytes 16-31.
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+        // Bytes 32-47.
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+        // Bytes 48-63: v28 supplies the 7 lookahead bytes past the 64-byte
+        // row; the load also advances x1 to the next source row.
+        ld1             {v28.8b}, [x1], x2
+        ext             v1.16b, v19.16b, v28.16b, #1
+        ext             v2.16b, v19.16b, v28.16b, #2
+        ext             v3.16b, v19.16b, v28.16b, #3
+        ext             v4.16b, v19.16b, v28.16b, #4
+        ext             v5.16b, v19.16b, v28.16b, #5
+        ext             v6.16b, v19.16b, v28.16b, #6
+        ext             v7.16b, v19.16b, v28.16b, #7
+        QPEL_H_CALC     v19, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Common prologue for the weighted uni H+V (hv) functions.  Arguments follow
+// the put_hevc_qpel_uni_w prototype: x0=dst, x1=dststride, x2=src,
+// x3=srcstride, w4=height, w5=denom, w6=wx, w7=ox, stack: mx, my, width.
+// It:
+//  - saves callee-saved x20-x28 plus x30 (the bl below clobbers lr);
+//  - allocates a 9088-byte temp buffer on the stack:
+//      (64 max height + 7 taps) rows * MAX_PB_SIZE*2 bytes = 71 * 128 = 9088;
+//  - runs the horizontal 8-tap pass over height+7 rows into that buffer;
+//  - loads the vertical filter taps into v0 and broadcasts wx/ox/-shift.
+// Register roles on exit:
+//   x20=dst, x21=dststride, w22=height, x23=my, w24=wx, w25=ox,
+//   w26=-(denom+6) (negative sqrshl count), w27=width, x10=MAX_PB_SIZE*2,
+//   sp=start of the filtered temp buffer, x28=pre-allocation sp for restore.
+.macro QPEL_UNI_W_HV_HEADER width
+        ldp             x14, x15, [sp]          // mx, my
+        ldr             w13, [sp, #16]          // width
+        stp             x20, x21, [sp, #-16]!
+        stp             x22, x23, [sp, #-16]!
+        stp             x24, x25, [sp, #-16]!
+        stp             x26, x27, [sp, #-16]!
+        stp             x28, x30, [sp, #-16]!
+        mov             x28, sp                 // remembered by QPEL_UNI_W_HV_END
+        mov             x11, #9088              // 71 rows * 128 bytes
+        sub             sp, sp, x11
+        mov             x20, x0
+        mov             x21, x1
+        mov             x0, sp                  // horizontal pass writes into the buffer
+        sub             x1, x2, x3, lsl 1       // src -= 3 * srcstride (tap history)
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             w3, w4, #7              // height + 7 rows to filter
+        mov             w22, w4                 // height
+        mov             x4, x14                 // mx
+        mov             x23, x15                // my
+        mov             w24, w6                 // wx
+        mov             w25, w7                 // ox
+        mov             w26, #-6
+        sub             w26, w26, w5            // -shift
+        mov             w27, w13                // width
+        bl              X(ff_hevc_put_hevc_qpel_h\width\()_8_neon_dotprod)
+        movrel          x9, qpel_filters
+        add             x9, x9, x23, lsl 3      // 8 bytes of taps per filter index
+        ld1             {v0.8b}, [x9]
+        sxtl            v0.8h, v0.8b            // taps widened to int16 for smlal
+        mov             x10, #(MAX_PB_SIZE * 2) // row stride of the temp buffer
+        dup             v28.4s, w24             // wx
+        dup             v29.4s, w25             // ox
+        dup             v30.4s, w26             // -shift
+.endm
+
+// Tear down QPEL_UNI_W_HV_HEADER: x28 still holds the sp value from before
+// the temp-buffer allocation, so restoring it discards the buffer no matter
+// how far the vertical pass walked sp forward; then pop the saved registers.
+.macro QPEL_UNI_W_HV_END
+        mov             sp, x28
+        ldp             x28, x30, [sp], #16
+        ldp             x26, x27, [sp], #16
+        ldp             x24, x25, [sp], #16
+        ldp             x22, x23, [sp], #16
+        ldp             x20, x21, [sp], #16
+.endm
+
+// Weight one row of 4 vertical-filter sums (32-bit, in v26) and store 4
+// output bytes at x20 (advanced by dststride x21):
+//   out = sat_u8( sat( ((v >> 6) * wx) >>(rounding) shift ) + ox )
+// v28 = wx, v30 = -shift (sqrshl with a negative count is a saturating
+// rounding right shift), v29 = ox.
+.macro QPEL_UNI_W_HV_4
+        sshr            v26.4s, v26.4s, #6
+        mul             v24.4s, v26.4s, v28.4s
+        sqrshl          v24.4s, v24.4s, v30.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.s}[0], [x20], x21
+.endm
+
+// 8-tap MAC over the low 4 halfwords of src0..src7 with the taps in
+// v0.h[0..7]; 32-bit accumulation into dst.4s.  The inputs are halfword
+// (int16) rows of the intermediate buffer, so this serves as the vertical
+// pass of the hv functions despite operating like the horizontal filter.
+.macro QPEL_FILTER_H    dst, src0, src1, src2, src3, src4, src5, src6, src7
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        smlal           \dst\().4s, \src4\().4h, v0.h[4]
+        smlal           \dst\().4s, \src5\().4h, v0.h[5]
+        smlal           \dst\().4s, \src6\().4h, v0.h[6]
+        smlal           \dst\().4s, \src7\().4h, v0.h[7]
+.endm
+
+// 8-tap MAC over the high 4 halfwords (lanes 4-7) of src0..src7, companion
+// to QPEL_FILTER_H; 32-bit accumulation into dst.4s.
+// NOTE: the argument list must stay on one line -- in the posted patch it was
+// wrapped by the mail client after "src6," (and the continuation lost its
+// diff '+' marker), which both corrupts the patch and is invalid gas syntax.
+.macro QPEL_FILTER_H2    dst, src0, src1, src2, src3, src4, src5, src6, src7
+        smull2          \dst\().4s, \src0\().8h, v0.h[0]
+        smlal2          \dst\().4s, \src1\().8h, v0.h[1]
+        smlal2          \dst\().4s, \src2\().8h, v0.h[2]
+        smlal2          \dst\().4s, \src3\().8h, v0.h[3]
+        smlal2          \dst\().4s, \src4\().8h, v0.h[4]
+        smlal2          \dst\().4s, \src5\().8h, v0.h[5]
+        smlal2          \dst\().4s, \src6\().8h, v0.h[6]
+        smlal2          \dst\().4s, \src7\().8h, v0.h[7]
+.endm
+
+// qpel_uni_w_hv, width 4.  The header runs the horizontal pass into the
+// stack buffer; the vertical pass below keeps a sliding window of 8 rows in
+// v16-v23 (d regs: 4 int16 each) and is unrolled 8x so each step reloads only
+// the register holding the oldest row.  sp walks forward through the buffer
+// (row stride x10 = MAX_PB_SIZE*2, so sp stays 16-byte aligned) and is
+// restored by QPEL_UNI_W_HV_END.
+function ff_hevc_put_hevc_qpel_uni_w_hv4_8_neon_dotprod, export=1
+        QPEL_UNI_W_HV_HEADER 4
+        // Prime the first 7 rows of tap history.
+        ldr             d16, [sp]
+        ldr             d17, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             d18, [sp]
+        ldr             d19, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             d20, [sp]
+        ldr             d21, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             d22, [sp]
+        add             sp, sp, x10
+1:
+        // Each unrolled step: load one new row, filter the current window
+        // (rotated by one register per step), weight and store 4 pixels.
+        ldr             d23, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d16, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d17, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d18, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d19, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d20, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d21, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d22, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.hi            1b
+
+2:
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+// Weight one row of 8 vertical-filter sums (low half in v26, high half in
+// v27) and store 8 output bytes at x20 (advanced by dststride x21).  Same
+// arithmetic as QPEL_UNI_W_HV_4: >>6, *wx (v28), saturating rounding right
+// shift by -v30, +ox (v29), narrow to u8.
+.macro QPEL_UNI_W_HV_8
+        sshr            v26.4s, v26.4s, #6
+        sshr            v27.4s, v27.4s, #6
+        mul             v24.4s, v26.4s, v28.4s
+        mul             v25.4s, v27.4s, v28.4s
+        sqrshl          v24.4s, v24.4s, v30.4s
+        sqrshl          v25.4s, v25.4s, v30.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.d}[0], [x20], x21
+.endm
+
+// qpel_uni_w_hv, width 8.  Same rotating-window scheme as the hv4 variant,
+// but with full q registers (8 int16 per row) and both QPEL_FILTER_H (low
+// lanes) and QPEL_FILTER_H2 (high lanes) per output row.
+function ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_dotprod, export=1
+        QPEL_UNI_W_HV_HEADER 8
+        // Prime the first 7 rows of tap history.
+        ldr             q16, [sp]
+        ldr             q17, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             q18, [sp]
+        ldr             q19, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             q20, [sp]
+        ldr             q21, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             q22, [sp]
+        add             sp, sp, x10
+1:
+        ldr             q23, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q16, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q17, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q18, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q19, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q20, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q21, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q22, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.hi            1b
+
+2:
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+// Weight one row of 16 vertical-filter sums (v24/v25 = columns 0-7 low/high,
+// v26/v27 = columns 8-15 low/high) and store 16 output bytes at x20
+// (advanced by dststride x21).  Same arithmetic as QPEL_UNI_W_HV_4/_8.
+.macro QPEL_UNI_W_HV_16
+        sshr            v24.4s, v24.4s, #6
+        sshr            v25.4s, v25.4s, #6
+        sshr            v26.4s, v26.4s, #6
+        sshr            v27.4s, v27.4s, #6
+        mul             v24.4s, v24.4s, v28.4s
+        mul             v25.4s, v25.4s, v28.4s
+        mul             v26.4s, v26.4s, v28.4s
+        mul             v27.4s, v27.4s, v28.4s
+        sqrshl          v24.4s, v24.4s, v30.4s
+        sqrshl          v25.4s, v25.4s, v30.4s
+        sqrshl          v26.4s, v26.4s, v30.4s
+        sqrshl          v27.4s, v27.4s, v30.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h
+        sqxtun2         v24.16b, v26.8h
+
+        st1             {v24.16b}, [x20], x21
+.endm
+
+// qpel_uni_w_hv, width 16.  Two rotating 8-row windows are kept in parallel:
+// v16-v23 for columns 0-7 and v1-v7/v31 for columns 8-15 (each row is 32
+// bytes in the temp buffer, loaded as an ldp pair).
+function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_dotprod, export=1
+        QPEL_UNI_W_HV_HEADER 16
+        // Prime the first 7 rows of tap history.
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+1:
+        ldp             q23, q31, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.hi            1b
+
+2:
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+
+// qpel_uni_w_hv, width 32 (also dispatched for widths 24/48 via w27 -- TODO
+// confirm against the init table): processes the block in 16-column strips.
+// Outer loop state: x11 = current strip's base in the temp buffer, x13 =
+// current strip's base in dst, w12 = saved height, w27 = columns remaining.
+// The inner loop is the same rotating-window vertical pass as hv16.
+function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_dotprod, export=1
+        QPEL_UNI_W_HV_HEADER 32
+        mov             x11, sp
+        mov             w12, w22
+        mov             x13, x20
+3:
+        // Prime the first 7 rows of tap history for this strip.
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+1:
+        ldp             q23, q31, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.hi            1b
+2:
+        // Advance to the next 16-column strip: buffer base += 32 bytes
+        // (16 int16 columns), dst base += 16 pixels, reset height.
+        subs            w27, w27, #16
+        add             sp, x11, #32
+        add             x20, x13, #16
+        mov             w22, w12
+        mov             x11, sp
+        mov             x13, x20
+        b.hi            3b
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+// qpel_uni_w_hv, width 64: identical strip-by-strip vertical pass to the
+// hv32 variant above (only the horizontal pass called by the header
+// differs).  NOTE(review): the body duplicates hv32 verbatim; the two could
+// share a macro.  Outer loop state: x11 = strip base in the temp buffer,
+// x13 = strip base in dst, w12 = saved height, w27 = columns remaining.
+function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_dotprod, export=1
+        QPEL_UNI_W_HV_HEADER 64
+        mov             x11, sp
+        mov             w12, w22
+        mov             x13, x20
+3:
+        // Prime the first 7 rows of tap history for this strip.
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+1:
+        ldp             q23, q31, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.hi            1b
+2:
+        // Advance to the next 16-column strip: buffer base += 32 bytes
+        // (16 int16 columns), dst base += 16 pixels, reset height.
+        subs            w27, w27, #16
+        add             sp, x11, #32
+        add             x20, x13, #16
+        mov             w22, w12
+        mov             x11, sp
+        mov             x13, x20
+        b.hi            3b
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+#endif // __ARM_FEATURE_DOTPROD
\ No newline at end of file

Comments

Jean-Baptiste Kempf May 2, 2023, 12:32 p.m. UTC | #1
Hello,

Just 2 questions:
- could you split this patch into several (3,4 or 5)
- are all those functions checked by checkasm?

Thanks,

jb

On Sun, 30 Apr 2023, at 10:57, myais wrote:
> Hi,
> This is a patch for the aarch64, which completes the neon versions of 
> the hevc_pel_uni_w_pixels, qpel_uni_w_h, qpel_uni_w_v, qpel_uni_w_hv 
> interfaces.
>
> put_hevc_pel_uni_w_pixels4_8_c: 54.3
> put_hevc_pel_uni_w_pixels4_8_neon: 24.1
> put_hevc_pel_uni_w_pixels6_8_c: 105.3
> put_hevc_pel_uni_w_pixels6_8_neon: 53.1
> put_hevc_pel_uni_w_pixels8_8_c: 176.6
> put_hevc_pel_uni_w_pixels8_8_neon: 63.8
> put_hevc_pel_uni_w_pixels12_8_c: 391.1
> put_hevc_pel_uni_w_pixels12_8_neon: 193.3
> put_hevc_pel_uni_w_pixels16_8_c: 688.1
> put_hevc_pel_uni_w_pixels16_8_neon: 226.1
> put_hevc_pel_uni_w_pixels24_8_c: 1542.3
> put_hevc_pel_uni_w_pixels24_8_neon: 536.8
> put_hevc_pel_uni_w_pixels32_8_c: 2753.1
> put_hevc_pel_uni_w_pixels32_8_neon: 875.8
> put_hevc_pel_uni_w_pixels48_8_c: 6251.1
> put_hevc_pel_uni_w_pixels48_8_neon: 1966.1
> put_hevc_pel_uni_w_pixels64_8_c: 11047.1
> put_hevc_pel_uni_w_pixels64_8_neon: 3449.8
>
> put_hevc_qpel_uni_w_h4_8_c: 156.6
> put_hevc_qpel_uni_w_h4_8_neon: 44.6
> put_hevc_qpel_uni_w_h6_8_c: 324.6
> put_hevc_qpel_uni_w_h6_8_neon: 103.1
> put_hevc_qpel_uni_w_h8_8_c: 549.3
> put_hevc_qpel_uni_w_h8_8_neon: 138.6
> put_hevc_qpel_uni_w_h12_8_c: 1240.3
> put_hevc_qpel_uni_w_h12_8_neon: 277.3
> put_hevc_qpel_uni_w_h16_8_c: 2161.8
> put_hevc_qpel_uni_w_h16_8_neon: 394.1
> put_hevc_qpel_uni_w_h24_8_c: 4874.8
> put_hevc_qpel_uni_w_h24_8_neon: 972.6
> put_hevc_qpel_uni_w_h32_8_c: 8517.8
> put_hevc_qpel_uni_w_h32_8_neon: 1517.3
> put_hevc_qpel_uni_w_h48_8_c: 19856.1
> put_hevc_qpel_uni_w_h48_8_neon: 3429.8
> put_hevc_qpel_uni_w_h64_8_c: 35159.3
> put_hevc_qpel_uni_w_h64_8_neon: 6018.1
>
> put_hevc_qpel_uni_w_v4_8_c: 180.6
> put_hevc_qpel_uni_w_v4_8_neon: 63.8
> put_hevc_qpel_uni_w_v6_8_c: 318.6
> put_hevc_qpel_uni_w_v6_8_neon: 117.8
> put_hevc_qpel_uni_w_v8_8_c: 547.6
> put_hevc_qpel_uni_w_v8_8_neon: 132.1
> put_hevc_qpel_uni_w_v12_8_c: 1202.8
> put_hevc_qpel_uni_w_v12_8_neon: 350.1
> put_hevc_qpel_uni_w_v16_8_c: 2109.6
> put_hevc_qpel_uni_w_v16_8_neon: 442.1
> put_hevc_qpel_uni_w_v24_8_c: 4748.8
> put_hevc_qpel_uni_w_v24_8_neon: 1287.1
> put_hevc_qpel_uni_w_v32_8_c: 8487.3
> put_hevc_qpel_uni_w_v32_8_neon: 1704.3
> put_hevc_qpel_uni_w_v48_8_c: 18798.8
> put_hevc_qpel_uni_w_v48_8_neon: 3790.8
> put_hevc_qpel_uni_w_v64_8_c: 35614.6
> put_hevc_qpel_uni_w_v64_8_neon: 6725.6
>
>
> put_hevc_qpel_uni_w_hv4_8_c: 498.8
> put_hevc_qpel_uni_w_hv4_8_neon: 139.3
> put_hevc_qpel_uni_w_hv6_8_c: 874.6
> put_hevc_qpel_uni_w_hv6_8_neon: 295.3
> put_hevc_qpel_uni_w_hv8_8_c: 1372.1
> put_hevc_qpel_uni_w_hv8_8_neon: 387.1
> put_hevc_qpel_uni_w_hv12_8_c: 2721.8
> put_hevc_qpel_uni_w_hv12_8_neon: 804.8
> put_hevc_qpel_uni_w_hv16_8_c: 4503.1
> put_hevc_qpel_uni_w_hv16_8_neon: 1038.1
> put_hevc_qpel_uni_w_hv24_8_c: 9321.8
> put_hevc_qpel_uni_w_hv24_8_neon: 2962.1
> put_hevc_qpel_uni_w_hv32_8_c: 15926.8
> put_hevc_qpel_uni_w_hv32_8_neon: 3858.6
> put_hevc_qpel_uni_w_hv48_8_c: 35051.1
> put_hevc_qpel_uni_w_hv48_8_neon: 9301.1
> put_hevc_qpel_uni_w_hv64_8_c: 61215.3
> put_hevc_qpel_uni_w_hv64_8_neon: 14920.1
>
> put_hevc_qpel_uni_h4_8_c: 143.3
> put_hevc_qpel_uni_h4_8_neon: 55.3
> put_hevc_qpel_uni_h6_8_c: 304.6
> put_hevc_qpel_uni_h6_8_neon: 82.3
> put_hevc_qpel_uni_h8_8_c: 557.8
> put_hevc_qpel_uni_h8_8_neon: 99.3
> put_hevc_qpel_uni_h12_8_c: 1228.3
> put_hevc_qpel_uni_h12_8_neon: 251.6
> put_hevc_qpel_uni_h16_8_c: 2210.3
> put_hevc_qpel_uni_h16_8_neon: 324.6
> put_hevc_qpel_uni_h24_8_c: 4859.1
> put_hevc_qpel_uni_h24_8_neon: 962.3
> put_hevc_qpel_uni_h32_8_c: 8728.6
> put_hevc_qpel_uni_h32_8_neon: 1249.6
> put_hevc_qpel_uni_h48_8_c: 20346.3
> put_hevc_qpel_uni_h48_8_neon: 2824.1
> put_hevc_qpel_uni_h64_8_c: 36702.6
> put_hevc_qpel_uni_h64_8_neon: 5012.1
>
>
>
>
> Signed-off-by: myais <Logan.Lyu@myais.com.cn>
> ---
>   libavcodec/aarch64/hevcdsp_init_aarch64.c |   96 +
>   libavcodec/aarch64/hevcdsp_qpel_neon.S    | 2223 +++++++++++++++++++++
>   2 files changed, 2319 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index be1049a2ec..42b8e9169d 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -128,6 +128,91 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t 
> *_dst, ptrdiff_t _dststride, co
>                                            ptrdiff_t _srcstride, const 
> int16_t *src2, int height, intptr_t
>                                            mx, intptr_t my, int width);
>   +#define NEON8_FNPROTO(fn, args, ext) \
> +    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
> +
> +#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
> +    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
> +
> +#define NEON8_FNPROTO_PARTIAL_5(fn, args, ext) \
> +    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
> +    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
> +
> +
> +NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
> +        const uint8_t *_src, ptrdiff_t _srcstride,
> +        int height, int denom, int wx, int ox,
> +        intptr_t mx, intptr_t my, int width),);
> +
> +NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
> +        const uint8_t *_src, ptrdiff_t _srcstride,
> +        int height, int denom, int wx, int ox,
> +        intptr_t mx, intptr_t my, int width),);
> +
> +#if defined(__ARM_FEATURE_DOTPROD)
> +NEON8_FNPROTO(qpel_h, (int16_t *dst,
> +        const uint8_t *_src, ptrdiff_t _srcstride,
> +        int height, intptr_t mx, intptr_t my, int width), _dotprod);
> +
> +NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
> +        const uint8_t *_src, ptrdiff_t _srcstride,
> +        int height, int denom, int wx, int ox,
> +        intptr_t mx, intptr_t my, int width), _dotprod);
> +
> +NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
> +        const uint8_t *_src, ptrdiff_t _srcstride,
> +        int height, int denom, int wx, int ox,
> +        intptr_t mx, intptr_t my, int width), _dotprod);
> +
> +#endif
> +
> +#define NEON8_FNASSIGN(member, v, h, fn, ext) \
> +        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
> +        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
> +        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
> +        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
> +        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
> +        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
> +        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
> +        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
> +        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
> +
> +#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
> +        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
> +        member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
> +        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
> +        member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
> +        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
> +        member[6][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
> +        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
> +        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
> +        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
> +
> +#define NEON8_FNASSIGN_PARTIAL_5(member, v, h, fn, ext) \
> +        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
> +        member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
> +        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
> +        member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
> +        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
> +        member[6][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
> +        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
> +        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
> +        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
> +
>   av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>   {
>       if (!have_neon(av_get_cpu_flags())) return;
> @@ -185,6 +270,17 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>           c->put_hevc_qpel_bi[7][0][1]   =
>           c->put_hevc_qpel_bi[8][0][1]   =
>           c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
> +
> +        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
> +        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
> +        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
> +
> +    #if defined(__ARM_FEATURE_DOTPROD)
> +        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _dotprod);
> +        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _dotprod);
> +        NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _dotprod);
> +
> +    #endif
>       }
>       if (bit_depth == 10) {
>           c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
> diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> index 0e7b912678..e30ac1b465 100644
> --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> @@ -30,6 +30,13 @@ const qpel_filters, align=4
>           .byte           0,  1, -5, 17, 58,-10, 4, -1
>   endconst
>   +const qpel_filters_abs, align=4
> +        .byte           0,  0,  0,  0,  0,  0, 0,  0
> +        .byte           1,  4, 10, 58, 17,  5, 1,  0
> +        .byte           1,  4, 11, 40, 40, 11, 4,  1
> +        .byte           0,  1,  5, 17, 58, 10, 4,  1
> +endconst
> +
>   .macro load_filter m
>           movrel          x15, qpel_filters
>           add             x15, x15, \m, lsl #3
> @@ -482,3 +489,2219 @@ endfunc
>   put_hevc qpel
>   put_hevc qpel_uni
>   put_hevc qpel_bi
> +
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
> +        mov     w10, #-6
> +        sub     w10, w10, w5
> +        dup     v30.8h, w6
> +        dup     v31.4s, w10
> +        dup     v29.8h, w7
> +1:
> +        ldr     s0, [x2]
> +        ldr     s1, [x2, x3]
> +        add     x2, x2, x3, lsl 1
> +        ushll   v0.8h, v0.8b, #6
> +        ushll   v1.8h, v1.8b, #6
> +        smull   v0.4s, v0.4h, v30.4h
> +        smull   v1.4s, v1.4h, v30.4h
> +        sqrshl  v0.4s, v0.4s, v31.4s
> +        sqrshl  v1.4s, v1.4s, v31.4s
> +        sqadd   v0.4s, v0.4s, v29.4s
> +        sqadd   v1.4s, v1.4s, v29.4s
> +        sqxtn  v0.4h, v0.4s
> +        sqxtn  v1.4h, v1.4s
> +        sqxtun  v0.8b, v0.8h
> +        sqxtun  v1.8b, v1.8h
> +        str     s0, [x0]
> +        str     s1, [x0, x1]
> +        add     x0, x0, x1, lsl 1
> +        subs    w4, w4, #2
> +        b.ne    1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
> +        mov     w10, #-6
> +        sub     w10, w10, w5
> +        dup     v30.8h, w6
> +        dup     v31.4s, w10
> +        dup     v29.4s, w7
> +        sub     x1, x1, #4
> +1:
> +        ldr     d0, [x2]
> +        ldr     d1, [x2, x3]
> +        add     x2, x2, x3, lsl 1
> +        ushll   v0.8h, v0.8b, #6
> +        ushll   v1.8h, v1.8b, #6
> +        smull   v4.4s, v0.4h, v30.4h
> +        smull2  v5.4s, v0.8h, v30.8h
> +        smull   v6.4s, v1.4h, v30.4h
> +        smull2  v7.4s, v1.8h, v30.8h
> +        sqrshl  v4.4s, v4.4s, v31.4s
> +        sqrshl  v5.4s, v5.4s, v31.4s
> +        sqrshl  v6.4s, v6.4s, v31.4s
> +        sqrshl  v7.4s, v7.4s, v31.4s
> +        sqadd   v4.4s, v4.4s, v29.4s
> +        sqadd   v5.4s, v5.4s, v29.4s
> +        sqadd   v6.4s, v6.4s, v29.4s
> +        sqadd   v7.4s, v7.4s, v29.4s
> +        sqxtn   v0.4h, v4.4s
> +        sqxtn2  v0.8h, v5.4s
> +        sqxtn   v1.4h, v6.4s
> +        sqxtn2  v1.8h, v7.4s
> +        sqxtun  v0.8b, v0.8h
> +        sqxtun  v1.8b, v1.8h
> +        str     s0, [x0], #4
> +        st1     {v0.h}[2], [x0], x1
> +        str     s1, [x0], #4
> +        st1     {v1.h}[2], [x0], x1
> +        subs    w4, w4, #2
> +        b.ne    1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
> +        mov     w10, #-6
> +        sub     w10, w10, w5
> +        dup     v30.8h, w6
> +        dup     v31.4s, w10
> +        dup     v29.4s, w7
> +1:
> +        ldr     d0, [x2]
> +        ldr     d1, [x2, x3]
> +        add     x2, x2, x3, lsl 1
> +        ushll   v0.8h, v0.8b, #6
> +        ushll   v1.8h, v1.8b, #6
> +        smull   v4.4s, v0.4h, v30.4h
> +        smull2  v5.4s, v0.8h, v30.8h
> +        smull   v6.4s, v1.4h, v30.4h
> +        smull2  v7.4s, v1.8h, v30.8h
> +        sqrshl  v4.4s, v4.4s, v31.4s
> +        sqrshl  v5.4s, v5.4s, v31.4s
> +        sqrshl  v6.4s, v6.4s, v31.4s
> +        sqrshl  v7.4s, v7.4s, v31.4s
> +        sqadd   v4.4s, v4.4s, v29.4s
> +        sqadd   v5.4s, v5.4s, v29.4s
> +        sqadd   v6.4s, v6.4s, v29.4s
> +        sqadd   v7.4s, v7.4s, v29.4s
> +        sqxtn   v0.4h, v4.4s
> +        sqxtn2  v0.8h, v5.4s
> +        sqxtn   v1.4h, v6.4s
> +        sqxtn2  v1.8h, v7.4s
> +        sqxtun  v0.8b, v0.8h
> +        sqxtun  v1.8b, v1.8h
> +        str     d0, [x0]
> +        str     d1, [x0, x1]
> +        add     x0, x0, x1, lsl 1
> +        subs    w4, w4, #2
> +        b.ne    1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
> +        mov     w10, #-6
> +        sub     w10, w10, w5
> +        dup     v30.8h, w6
> +        dup     v31.4s, w10
> +        dup     v29.4s, w7
> +        sub     x1, x1, #8
> +1:
> +        ldr     q0, [x2]
> +        ldr     q1, [x2, x3]
> +        add     x2, x2, x3, lsl 1
> +        ushll   v4.8h, v0.8b, #6
> +        ushll2  v5.8h, v0.16b, #6
> +        ushll   v6.8h, v1.8b, #6
> +        ushll2  v7.8h, v1.16b, #6
> +        smull   v16.4s, v4.4h, v30.4h
> +        smull2  v17.4s, v4.8h, v30.8h
> +        smull   v18.4s, v5.4h, v30.4h
> +        smull2  v19.4s, v5.8h, v30.8h
> +        smull   v20.4s, v6.4h, v30.4h
> +        smull2  v21.4s, v6.8h, v30.8h
> +        smull   v22.4s, v7.4h, v30.4h
> +        smull2  v23.4s, v7.8h, v30.8h
> +
> +        sqrshl  v16.4s, v16.4s, v31.4s
> +        sqrshl  v17.4s, v17.4s, v31.4s
> +        sqrshl  v18.4s, v18.4s, v31.4s
> +        sqrshl  v19.4s, v19.4s, v31.4s
> +        sqrshl  v20.4s, v20.4s, v31.4s
> +        sqrshl  v21.4s, v21.4s, v31.4s
> +        sqrshl  v22.4s, v22.4s, v31.4s
> +        sqrshl  v23.4s, v23.4s, v31.4s
> +        sqadd   v16.4s, v16.4s, v29.4s
> +        sqadd   v17.4s, v17.4s, v29.4s
> +        sqadd   v18.4s, v18.4s, v29.4s
> +        sqadd   v19.4s, v19.4s, v29.4s
> +        sqadd   v20.4s, v20.4s, v29.4s
> +        sqadd   v21.4s, v21.4s, v29.4s
> +        sqadd   v22.4s, v22.4s, v29.4s
> +        sqadd   v23.4s, v23.4s, v29.4s
> +        sqxtn   v0.4h, v16.4s
> +        sqxtn2  v0.8h, v17.4s
> +        sqxtn   v1.4h, v18.4s
> +        sqxtn2  v1.8h, v19.4s
> +        sqxtn   v2.4h, v20.4s
> +        sqxtn2  v2.8h, v21.4s
> +        sqxtn   v3.4h, v22.4s
> +        sqxtn2  v3.8h, v23.4s
> +        sqxtun  v0.8b, v0.8h
> +        sqxtun2 v0.16b, v1.8h
> +        sqxtun  v2.8b, v2.8h
> +        sqxtun2 v2.16b, v3.8h
> +        str     d0, [x0], #8
> +        st1     {v0.s}[2], [x0], x1
> +        str     d2, [x0], #8
> +        st1     {v2.s}[2], [x0], x1
> +        subs    w4, w4, #2
> +        b.ne    1b
> +        ret
> +endfunc
> +
> +.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3
> +        ushll   \t0\().8h, \s0\().8b, #6
> +        ushll2  \t1\().8h, \s0\().16b, #6
> +        smull   \d0\().4s, \t0\().4h, v30.4h
> +        smull2  \d1\().4s, \t0\().8h, v30.8h
> +        smull   \d2\().4s, \t1\().4h, v30.4h
> +        smull2  \d3\().4s, \t1\().8h, v30.8h
> +        sqrshl  \d0\().4s, \d0\().4s, v31.4s
> +        sqrshl  \d1\().4s, \d1\().4s, v31.4s
> +        sqrshl  \d2\().4s, \d2\().4s, v31.4s
> +        sqrshl  \d3\().4s, \d3\().4s, v31.4s
> +        sqadd   \d0\().4s, \d0\().4s, v29.4s
> +        sqadd   \d1\().4s, \d1\().4s, v29.4s
> +        sqadd   \d2\().4s, \d2\().4s, v29.4s
> +        sqadd   \d3\().4s, \d3\().4s, v29.4s
> +        sqxtn   \t0\().4h, \d0\().4s
> +        sqxtn2  \t0\().8h, \d1\().4s
> +        sqxtn   \t1\().4h, \d2\().4s
> +        sqxtn2  \t1\().8h, \d3\().4s
> +        sqxtun  \s0\().8b,  \t0\().8h
> +        sqxtun2 \s0\().16b, \t1\().8h
> +.endm
> +
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
> +        mov     w10, #-6
> +        sub     w10, w10, w5
> +        dup     v30.8h, w6
> +        dup     v31.4s, w10
> +        dup     v29.4s, w7
> +1:
> +        ldr     q0, [x2]
> +        ldr     q1, [x2, x3]
> +        add     x2, x2, x3, lsl 1
> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
> +        str     q0, [x0]
> +        str     q1, [x0, x1]
> +        add     x0, x0, x1, lsl 1
> +        subs    w4, w4, #2
> +        b.ne    1b
> +        ret
> +endfunc
> +
> +
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
> +        mov     w10, #-6
> +        sub     w10, w10, w5
> +        dup     v30.8h, w6
> +        dup     v31.4s, w10
> +        dup     v29.4s, w7
> +1:
> +        ld1     {v0.16b, v1.16b}, [x2], x3
> +        ushll   v4.8h, v0.8b, #6
> +        ushll2  v5.8h, v0.16b, #6
> +        ushll   v6.8h, v1.8b, #6
> +        smull   v16.4s, v4.4h, v30.4h
> +        smull2  v17.4s, v4.8h, v30.8h
> +        smull   v18.4s, v5.4h, v30.4h
> +        smull2  v19.4s, v5.8h, v30.8h
> +        smull   v20.4s, v6.4h, v30.4h
> +        smull2  v21.4s, v6.8h, v30.8h
> +        sqrshl  v16.4s, v16.4s, v31.4s
> +        sqrshl  v17.4s, v17.4s, v31.4s
> +        sqrshl  v18.4s, v18.4s, v31.4s
> +        sqrshl  v19.4s, v19.4s, v31.4s
> +        sqrshl  v20.4s, v20.4s, v31.4s
> +        sqrshl  v21.4s, v21.4s, v31.4s
> +        sqadd   v16.4s, v16.4s, v29.4s
> +        sqadd   v17.4s, v17.4s, v29.4s
> +        sqadd   v18.4s, v18.4s, v29.4s
> +        sqadd   v19.4s, v19.4s, v29.4s
> +        sqadd   v20.4s, v20.4s, v29.4s
> +        sqadd   v21.4s, v21.4s, v29.4s
> +        sqxtn   v0.4h, v16.4s
> +        sqxtn2  v0.8h, v17.4s
> +        sqxtn   v1.4h, v18.4s
> +        sqxtn2  v1.8h, v19.4s
> +        sqxtn   v2.4h, v20.4s
> +        sqxtn2  v2.8h, v21.4s
> +        sqxtun  v0.8b, v0.8h
> +        sqxtun  v1.8b, v1.8h
> +        sqxtun  v2.8b, v2.8h
> +        st1     {v0.8b, v1.8b, v2.8b}, [x0], x1
> +        subs    w4, w4, #1
> +        b.ne    1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
> +        mov     w10, #-6
> +        sub     w10, w10, w5
> +        dup     v30.8h, w6
> +        dup     v31.4s, w10
> +        dup     v29.4s, w7
> +1:
> +        ld1     {v0.16b, v1.16b}, [x2], x3
> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
> +        st1     {v0.16b, v1.16b}, [x0], x1
> +        subs    w4, w4, #1
> +        b.ne    1b
> +        ret
> +endfunc
> +
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
> +        mov     w10, #-6
> +        sub     w10, w10, w5
> +        dup     v30.8h, w6
> +        dup     v31.4s, w10
> +        dup     v29.4s, w7
> +1:
> +        ld1     {v0.16b, v1.16b, v2.16b}, [x2], x3
> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
> +        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
> +        st1     {v0.16b, v1.16b, v2.16b}, [x0], x1
> +        subs    w4, w4, #1
> +        b.ne    1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
> +        mov     w10, #-6
> +        sub     w10, w10, w5
> +        dup     v30.8h, w6
> +        dup     v31.4s, w10
> +        dup     v29.4s, w7
> +1:
> +        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
> +        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
> +        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
> +        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
> +        subs    w4, w4, #1
> +        b.ne    1b
> +        ret
> +endfunc
> +
> +.macro QPEL_UNI_W_V_HEADER
> +        ldur            x12, [sp, #8]          // my
> +        sub             x2, x2, x3, lsl #1
> +        sub             x2, x2, x3
> +        movrel          x9, qpel_filters_abs
> +        add             x9, x9, x12, lsl 3
> +        ldr             d28, [x9]
> +        dup             v0.16b, v28.b[0]
> +        dup             v1.16b, v28.b[1]
> +        dup             v2.16b, v28.b[2]
> +        dup             v3.16b, v28.b[3]
> +        dup             v4.16b, v28.b[4]
> +        dup             v5.16b, v28.b[5]
> +        dup             v6.16b, v28.b[6]
> +        dup             v7.16b, v28.b[7]
> +
> +        mov             w10, #-6
> +        sub             w10, w10, w5
> +        dup             v30.8h, w6              // wx
> +        dup             v31.4s, w10             // shift
> +        dup             v29.4s, w7              // ox
> +.endm
> +
> +.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
> +        umull           \dst\().8h, \src1\().8b, v1.8b
> +        umlsl           \dst\().8h, \src0\().8b, v0.8b
> +        umlsl           \dst\().8h, \src2\().8b, v2.8b
> +        umlal           \dst\().8h, \src3\().8b, v3.8b
> +        umlal           \dst\().8h, \src4\().8b, v4.8b
> +        umlsl           \dst\().8h, \src5\().8b, v5.8b
> +        umlal           \dst\().8h, \src6\().8b, v6.8b
> +        umlsl           \dst\().8h, \src7\().8b, v7.8b
> +.endm
> +
> +.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
> +        umull2          \dst\().8h, \src1\().16b, v1.16b
> +        umlsl2          \dst\().8h, \src0\().16b, v0.16b
> +        umlsl2          \dst\().8h, \src2\().16b, v2.16b
> +        umlal2          \dst\().8h, \src3\().16b, v3.16b
> +        umlal2          \dst\().8h, \src4\().16b, v4.16b
> +        umlsl2          \dst\().8h, \src5\().16b, v5.16b
> +        umlal2          \dst\().8h, \src6\().16b, v6.16b
> +        umlsl2          \dst\().8h, \src7\().16b, v7.16b
> +.endm
> +
> +.macro  QPEL_UNI_W_V_4
> +        smull           v24.4s, v24.4h, v30.4h
> +        sqrshl          v24.4s, v24.4s, v31.4s
> +        sqadd           v24.4s, v24.4s, v29.4s
> +        sqxtn           v24.4h, v24.4s
> +        sqxtun          v24.8b, v24.8h
> +        st1             {v24.s}[0], [x0], x1
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
> +        QPEL_UNI_W_V_HEADER
> +        ldr             s16, [x2]
> +        ldr             s17, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             s18, [x2]
> +        ldr             s19, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             s20, [x2]
> +        ldr             s21, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             s22, [x2]
> +
> +1:      ldr             s23, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v24, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_UNI_W_V_4
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             s16, [x2]
> +        QPEL_FILTER_B     v24, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_UNI_W_V_4
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             s17, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v24, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_UNI_W_V_4
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             s18, [x2]
> +        QPEL_FILTER_B     v24, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_UNI_W_V_4
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             s19, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v24, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_UNI_W_V_4
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             s20, [x2]
> +        QPEL_FILTER_B     v24, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_UNI_W_V_4
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             s21, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v24, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_UNI_W_V_4
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             s22, [x2]
> +        QPEL_FILTER_B     v24, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_UNI_W_V_4
> +        subs            w4, w4, #1
> +        b.ne            1b
> +2:
> +        ret
> +endfunc
> +
> +.macro QPEL_UNI_W_V_8
> +        smull           v24.4s, v26.4h, v30.4h
> +        smull2          v25.4s, v26.8h, v30.8h
> +        sqrshl          v24.4s, v24.4s, v31.4s
> +        sqrshl          v25.4s, v25.4s, v31.4s
> +        sqadd           v24.4s, v24.4s, v29.4s
> +        sqadd           v25.4s, v25.4s, v29.4s
> +        sqxtn           v24.4h, v24.4s
> +        sqxtn2          v24.8h, v25.4s
> +        sqxtun          v24.8b, v24.8h
> +        st1             {v24.d}[0], [x0], x1
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
> +        QPEL_UNI_W_V_HEADER
> +        ldr             d16, [x2]
> +        ldr             d17, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             d18, [x2]
> +        ldr             d19, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             d20, [x2]
> +        ldr             d21, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             d22, [x2]
> +
> +1:      ldr             d23, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_UNI_W_V_8
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             d16, [x2]
> +        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_UNI_W_V_8
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             d17, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_UNI_W_V_8
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             d18, [x2]
> +        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_UNI_W_V_8
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             d19, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_UNI_W_V_8
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             d20, [x2]
> +        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_UNI_W_V_8
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             d21, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_UNI_W_V_8
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             d22, [x2]
> +        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_UNI_W_V_8
> +        subs            w4, w4, #1
> +        b.ne            1b
> +2:
> +        ret
> +endfunc
> +
> +.macro QPEL_UNI_W_V_16
> +        smull           v24.4s, v26.4h, v30.4h
> +        smull2          v25.4s, v26.8h, v30.8h
> +        smull           v26.4s, v27.4h, v30.4h
> +        smull2          v27.4s, v27.8h, v30.8h
> +        sqrshl          v24.4s, v24.4s, v31.4s
> +        sqrshl          v25.4s, v25.4s, v31.4s
> +        sqrshl          v26.4s, v26.4s, v31.4s
> +        sqrshl          v27.4s, v27.4s, v31.4s
> +        sqadd           v24.4s, v24.4s, v29.4s
> +        sqadd           v25.4s, v25.4s, v29.4s
> +        sqadd           v26.4s, v26.4s, v29.4s
> +        sqadd           v27.4s, v27.4s, v29.4s
> +        sqxtn           v24.4h, v24.4s
> +        sqxtn2          v24.8h, v25.4s
> +        sqxtn           v26.4h, v26.4s
> +        sqxtn2          v26.8h, v27.4s
> +        sqxtun          v24.8b, v24.8h
> +        sqxtun2         v24.16b, v26.8h
> +        st1             {v24.16b}, [x0], x1
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
> +        QPEL_UNI_W_V_HEADER
> +        ldr             q16, [x2]
> +        ldr             q17, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             q18, [x2]
> +        ldr             q19, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             q20, [x2]
> +        ldr             q21, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             q22, [x2]
> +
> +1:      ldr             q23, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q16, [x2]
> +        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q17, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q18, [x2]
> +        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q19, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q20, [x2]
> +        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q21, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q22, [x2]
> +        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.ne            1b
> +2:
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
> +        QPEL_UNI_W_V_HEADER
> +        ldur            w13, [sp, #16]
> +        mov             x14, x0
> +        mov             x15, x2
> +        mov             w11, w4
> +
> +3:
> +        ldr             q16, [x2]
> +        ldr             q17, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             q18, [x2]
> +        ldr             q19, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             q20, [x2]
> +        ldr             q21, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        ldr             q22, [x2]
> +
> +
> +1:      ldr             q23, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q16, [x2]
> +        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q17, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q18, [x2]
> +        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q19, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q20, [x2]
> +        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q21, [x2, x3]
> +        add             x2, x2, x3, lsl 1
> +        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
> +
> +        ldr             q22, [x2]
> +        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.ne            1b
> +2:
> +        subs            w13, w13, #16
> +        add             x14, x14, #16
> +        add             x15, x15, #16
> +        mov             x0, x14
> +        mov             x2, x15
> +        mov             w4, w11
> +        b.hi            3b
> +        ret
> +endfunc
> +
> +// NOTE(review): usdot used below is FEAT_I8MM (__ARM_FEATURE_MATMUL_INT8),
> +// not FEAT_DOTPROD — this guard macro looks wrong; also compile-time #if
> +// instead of runtime CPU-flag dispatch is unusual for FFmpeg — confirm.
> +#if __ARM_FEATURE_DOTPROD
> +// Shared prologue for the qpel_uni_w_h* functions.
> +// Register contract (uni_w): x0=dst, x1=dststride, x2=src, x3=srcstride,
> +// w4=height, w5=denom, w6=wx, w7=ox, [sp]=mx (filter index).
> +// Loads the 8 signed filter taps replicated per 64-bit lane into v28
> +// (layout required by the per-lane usdot), broadcasts wx/ox, and puts
> +// -(denom + 6) in v31 so sqrshl performs the rounding right shift.
> +.macro QPEL_UNI_W_H_HEADER
> +        ldr             x12, [sp]               // mx: horizontal filter index
> +        sub             x2, x2, #3              // rewind src to 8-tap window start
> +        movrel          x9, qpel_filters
> +        add             x9, x9, x12, lsl 3      // 8 filter bytes per index
> +        ldr             x11, [x9]
> +        dup             v28.2d, x11             // taps replicated to both halves
> +        mov             w10, #-6
> +        sub             w10, w10, w5            // w10 = -(denom + 6)
> +        dup             v30.4s, w6              // wx
> +        dup             v31.4s, w10             // shift
> +        dup             v29.4s, w7              // ox
> +.endm
> +
> +// 4-wide 8-bit horizontal qpel filter with weighted uni prediction.
> +// Per row: build the four 8-byte tap windows, dot-product them against
> +// the taps, then apply wx, the rounding shift and ox, and narrow to u8.
> +function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_dotprod, export=1
> +        QPEL_UNI_W_H_HEADER
> +1:
> +        ld1             {v0.16b}, [x2], x3
> +        ext             v1.16b, v0.16b, v0.16b, #1
> +        ext             v2.16b, v0.16b, v0.16b, #2
> +        ext             v3.16b, v0.16b, v0.16b, #3
> +        zip1            v0.2d, v0.2d, v1.2d     // windows for pixels 0 and 1
> +        zip1            v2.2d, v2.2d, v3.2d     // windows for pixels 2 and 3
> +        movi            v16.2d, #0
> +        movi            v17.2d, #0
> +        usdot           v16.4s, v0.16b, v28.16b // u8 src * s8 taps, 4-way per lane
> +        usdot           v17.4s, v2.16b, v28.16b
> +        addp            v16.4s, v16.4s, v17.4s  // fold lane pairs -> 4 full sums
> +        mul             v16.4s, v16.4s, v30.4s  // * wx
> +        sqrshl          v16.4s, v16.4s, v31.4s  // rounding >> (denom + 6)
> +        sqadd           v16.4s, v16.4s, v29.4s  // + ox, saturating
> +        sqxtn           v16.4h, v16.4s
> +        sqxtun          v16.8b, v16.8h          // clamp to unsigned 8-bit
> +        str             s16, [x0]
> +        add             x0, x0, x1
> +        subs            w4, w4, #1
> +        b.hi            1b
> +        ret
> +endfunc
> +
> +// 6-wide variant: pixels 0-3 follow the full 4s path (v16), pixels 4-5
> +// the 2s half path (v18). The row is stored as 4 bytes + 2 bytes, so the
> +// dst stride is pre-decremented by the 4 bytes of the first store.
> +function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_dotprod, export=1
> +        QPEL_UNI_W_H_HEADER
> +        sub             x1, x1, #4
> +1:
> +        ld1             {v0.16b}, [x2], x3
> +        ext             v1.16b, v0.16b, v0.16b, #1
> +        ext             v2.16b, v0.16b, v0.16b, #2
> +        ext             v3.16b, v0.16b, v0.16b, #3
> +        ext             v4.16b, v0.16b, v0.16b, #4
> +        ext             v5.16b, v0.16b, v0.16b, #5
> +        zip1            v0.2d, v0.2d, v1.2d     // windows 0,1
> +        zip1            v2.2d, v2.2d, v3.2d     // windows 2,3
> +        zip1            v4.2d, v4.2d, v5.2d     // windows 4,5
> +        movi            v16.2d, #0
> +        movi            v17.2d, #0
> +        movi            v18.2d, #0
> +        usdot           v16.4s, v0.16b, v28.16b
> +        usdot           v17.4s, v2.16b, v28.16b
> +        usdot           v18.4s, v4.16b, v28.16b
> +        addp            v16.4s, v16.4s, v17.4s  // pixels 0-3
> +        addp            v18.4s, v18.4s, v18.4s  // pixels 4-5 (low half)
> +        mul             v16.4s, v16.4s, v30.4s
> +        mul             v18.2s, v18.2s, v30.2s
> +        sqrshl          v16.4s, v16.4s, v31.4s
> +        sqrshl          v18.2s, v18.2s, v31.2s
> +        sqadd           v16.4s, v16.4s, v29.4s
> +        sqadd           v18.2s, v18.2s, v29.2s
> +        sqxtn           v16.4h, v16.4s
> +        sqxtn2          v16.8h, v18.4s
> +        sqxtun          v16.8b, v16.8h
> +        str             s16, [x0], #4           // pixels 0-3
> +        st1             {v16.h}[2], [x0], x1    // pixels 4-5
> +        subs            w4, w4, #1
> +        b.hi            1b
> +        ret
> +endfunc
> +
> +
> +// Weighted-prediction core for four 16-byte window pairs \s0..\s3:
> +// usdot against the taps in v28, pairwise fold, then apply wx (v30),
> +// the rounding shift (v31, negative => right shift) and ox (v29).
> +// Folded results land in \d0 and \d2; \d1 and \d3 are scratch.
> +.macro  QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
> +        movi            \d0\().2d, #0
> +        movi            \d1\().2d, #0
> +        movi            \d2\().2d, #0
> +        movi            \d3\().2d, #0
> +        usdot           \d0\().4s, \s0\().16b, v28.16b
> +        usdot           \d1\().4s, \s1\().16b, v28.16b
> +        usdot           \d2\().4s, \s2\().16b, v28.16b
> +        usdot           \d3\().4s, \s3\().16b, v28.16b
> +        addp            \d0\().4s, \d0\().4s, \d1\().4s
> +        addp            \d2\().4s, \d2\().4s, \d3\().4s
> +        mul             \d0\().4s, \d0\().4s, v30.4s
> +        mul             \d2\().4s, \d2\().4s, v30.4s
> +        sqrshl          \d0\().4s, \d0\().4s, v31.4s
> +        sqrshl          \d2\().4s, \d2\().4s, v31.4s
> +        sqadd           \d0\().4s, \d0\().4s, v29.4s
> +        sqadd           \d2\().4s, \d2\().4s, v29.4s
> +.endm
> +
> +// Half-width variant of QPEL_UNI_W_H_CALC: two source window pairs,
> +// one folded + weighted result in \d0; \d1 is scratch.
> +.macro  QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
> +        movi            \d0\().2d, #0
> +        movi            \d1\().2d, #0
> +        usdot           \d0\().4s, \s0\().16b, v28.16b
> +        usdot           \d1\().4s, \s1\().16b, v28.16b
> +        addp            \d0\().4s, \d0\().4s, \d1\().4s
> +        mul             \d0\().4s, \d0\().4s, v30.4s
> +        sqrshl          \d0\().4s, \d0\().4s, v31.4s
> +        sqadd           \d0\().4s, \d0\().4s, v29.4s
> +.endm
> +
> +
> +// 8-wide variant: zip the 8 shifted copies into four window pairs and
> +// run one QPEL_UNI_W_H_CALC; pixel order is already sequential here.
> +function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_dotprod, export=1
> +        QPEL_UNI_W_H_HEADER
> +1:
> +        ld1             {v16.16b, v17.16b}, [x2], x3
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        zip1            v0.2d, v16.2d, v1.2d    // windows 0,1
> +        zip1            v2.2d, v2.2d, v3.2d     // windows 2,3
> +        zip1            v4.2d, v4.2d, v5.2d     // windows 4,5
> +        zip1            v6.2d, v6.2d, v7.2d     // windows 6,7
> +        QPEL_UNI_W_H_CALC  v0, v2, v4, v6,  v18, v19, v20, v21
> +        sqxtn           v18.4h, v18.4s
> +        sqxtn2          v18.8h, v20.4s
> +        sqxtun          v18.8b, v18.8h
> +        str             d18, [x0]
> +        add             x0, x0, x1
> +        subs            w4, w4, #1
> +        b.hi            1b
> +        ret
> +endfunc
> +
> +// 12-wide variant: pixels 0-7 via the full macro, pixels 8-11 via the
> +// half macro on the zip2 (upper) window pairs. 8 bytes go to x0,
> +// the remaining 4 to x13 = x0 + 8.
> +function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_dotprod, export=1
> +        QPEL_UNI_W_H_HEADER
> +        add             x13, x0, #8
> +1:
> +        ld1             {v16.16b, v17.16b}, [x2], x3
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        zip1            v18.2d, v16.2d, v1.2d   // windows 0,1
> +        zip1            v19.2d, v2.2d, v3.2d    // windows 2,3
> +        zip1            v20.2d, v4.2d, v5.2d    // windows 4,5
> +        zip1            v21.2d, v6.2d, v7.2d    // windows 6,7
> +        zip2            v22.2d, v16.2d, v1.2d   // windows 8,9
> +        zip2            v23.2d, v2.2d, v3.2d    // windows 10,11
> +        QPEL_UNI_W_H_CALC  v18, v19, v20, v21, v0, v2, v4, v6
> +        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25
> +        sqxtn           v0.4h, v0.4s
> +        sqxtn2          v0.8h, v4.4s
> +        sqxtn           v1.4h, v24.4s
> +        sqxtun          v0.8b, v0.8h
> +        sqxtun          v1.8b, v1.8h
> +
> +        str             d0, [x0]                // pixels 0-7
> +        str             s1, [x13]               // pixels 8-11
> +        add             x0, x0, x1
> +        add             x13, x13, x1
> +        subs            w4, w4, #1
> +        b.hi            1b
> +        ret
> +endfunc
> +
> +// 16-wide variant: the interleaved source ordering (see per-call notes)
> +// produces out-of-order pixel sums that trn1/trn2 restore to sequential
> +// order before the final narrow.
> +// NOTE(review): the two end-of-line comments below were line-wrapped by
> +// the mail client; rejoined here so the patch assembles.
> +function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_dotprod, export=1
> +        QPEL_UNI_W_H_HEADER
> +1:
> +        ld1             {v16.16b, v17.16b}, [x2], x3
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21   // v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25    // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
> +        sqxtn           v0.4h, v18.4s
> +        sqxtn2          v0.8h, v22.4s
> +        sqxtn           v1.4h, v20.4s
> +        sqxtn2          v1.8h, v24.4s
> +        trn1            v2.8h, v0.8h, v1.8h     // even/odd re-interleave
> +        trn2            v3.8h, v0.8h, v1.8h
> +        sqxtun          v0.8b, v2.8h
> +        sqxtun2         v0.16b, v3.8h
> +        st1             {v0.16b}, [x0], x1
> +        subs            w4, w4, #1
> +        b.hi            1b
> +        ret
> +endfunc
> +
> +// 24-wide variant: pixels 0-15 use the interleaved 16-wide scheme
> +// (trn1/trn2 reorder), pixels 16-23 the zip-based 8-wide scheme.
> +// dst stride is pre-decremented by the 16 bytes of the first store.
> +function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_dotprod, export=1
> +        QPEL_UNI_W_H_HEADER
> +        sub             x1, x1, #16
> +1:
> +        ld1             {v16.16b, v17.16b}, [x2], x3
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21
> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
> +        sqxtn           v18.4h, v18.4s
> +        sqxtn2          v18.8h, v22.4s
> +        sqxtn           v19.4h, v20.4s
> +        sqxtn2          v19.8h, v24.4s
> +        trn1            v20.8h, v18.8h, v19.8h
> +        trn2            v21.8h, v18.8h, v19.8h
> +        sqxtun          v26.8b, v20.8h
> +        sqxtun2         v26.16b, v21.8h                         // 0-15
> +        ext             v1.16b, v17.16b, v17.16b, #1
> +        ext             v2.16b, v17.16b, v17.16b, #2
> +        ext             v3.16b, v17.16b, v17.16b, #3
> +        ext             v4.16b, v17.16b, v17.16b, #4
> +        ext             v5.16b, v17.16b, v17.16b, #5
> +        ext             v6.16b, v17.16b, v17.16b, #6
> +        ext             v7.16b, v17.16b, v17.16b, #7
> +        zip1            v0.2d, v17.2d, v1.2d    // windows 16,17
> +        zip1            v2.2d, v2.2d, v3.2d     // windows 18,19
> +        zip1            v4.2d, v4.2d, v5.2d     // windows 20,21
> +        zip1            v6.2d, v6.2d, v7.2d     // windows 22,23
> +        QPEL_UNI_W_H_CALC  v0, v2, v4, v6, v18, v19, v20, v21
> +        sqxtn           v18.4h, v18.4s
> +        sqxtn2          v18.8h, v20.4s
> +        sqxtun          v27.8b, v18.8h
> +
> +        st1             {v26.16b}, [x0], #16
> +        st1             {v27.8b}, [x0], x1
> +        subs            w4, w4, #1
> +        b.hi            1b
> +        ret
> +endfunc
> +
> +
> +// 32-wide variant: two back-to-back 16-wide interleaved passes over
> +// v16/v17 and v17/v18, stored together as 32 bytes per row.
> +function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_dotprod, export=1
> +        QPEL_UNI_W_H_HEADER
> +1:
> +        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v0, v19, v20, v21
> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
> +        sqxtn           v0.4h, v0.4s
> +        sqxtn2          v0.8h, v22.4s
> +        sqxtn           v19.4h, v20.4s
> +        sqxtn2          v19.8h, v24.4s
> +        trn1            v20.8h, v0.8h, v19.8h
> +        trn2            v21.8h, v0.8h, v19.8h
> +        sqxtun          v26.8b, v20.8h
> +        sqxtun2         v26.16b, v21.8h                         // 0-15
> +        ext             v1.16b, v17.16b, v18.16b, #1
> +        ext             v2.16b, v17.16b, v18.16b, #2
> +        ext             v3.16b, v17.16b, v18.16b, #3
> +        ext             v4.16b, v17.16b, v18.16b, #4
> +        ext             v5.16b, v17.16b, v18.16b, #5
> +        ext             v6.16b, v17.16b, v18.16b, #6
> +        ext             v7.16b, v17.16b, v18.16b, #7
> +        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v0, v19, v20, v21
> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
> +        sqxtn           v0.4h, v0.4s
> +        sqxtn2          v0.8h, v22.4s
> +        sqxtn           v19.4h, v20.4s
> +        sqxtn2          v19.8h, v24.4s
> +        trn1            v20.8h, v0.8h, v19.8h
> +        trn2            v21.8h, v0.8h, v19.8h
> +        sqxtun          v27.8b, v20.8h
> +        sqxtun2         v27.16b, v21.8h                         // 16-31
> +        st1             {v26.16b, v27.16b}, [x0], x1
> +        subs            w4, w4, #1
> +        b.hi            1b
> +        ret
> +endfunc
> +
> +// 48-wide variant: three 16-wide interleaved passes (v16/v17, v17/v18,
> +// v18/v19). v24 and v0 are reused as pure scratch destinations since
> +// QPEL_UNI_W_H_CALC only returns results in its d0/d2 slots.
> +function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_dotprod, export=1
> +        QPEL_UNI_W_H_HEADER
> +1:
> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
> +        sqxtn           v20.4h, v20.4s
> +        sqxtn2          v20.8h, v22.4s
> +        sqxtn           v21.4h, v21.4s
> +        sqxtn2          v21.8h, v23.4s
> +        trn1            v22.8h, v20.8h, v21.8h
> +        trn2            v23.8h, v20.8h, v21.8h
> +        sqxtun          v25.8b, v22.8h
> +        sqxtun2         v25.16b, v23.8h                         // 0-15
> +        ext             v1.16b, v17.16b, v18.16b, #1
> +        ext             v2.16b, v17.16b, v18.16b, #2
> +        ext             v3.16b, v17.16b, v18.16b, #3
> +        ext             v4.16b, v17.16b, v18.16b, #4
> +        ext             v5.16b, v17.16b, v18.16b, #5
> +        ext             v6.16b, v17.16b, v18.16b, #6
> +        ext             v7.16b, v17.16b, v18.16b, #7
> +        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
> +        sqxtn           v20.4h, v20.4s
> +        sqxtn2          v20.8h, v22.4s
> +        sqxtn           v21.4h, v21.4s
> +        sqxtn2          v21.8h, v23.4s
> +        trn1            v22.8h, v20.8h, v21.8h
> +        trn2            v23.8h, v20.8h, v21.8h
> +        sqxtun          v26.8b, v22.8h
> +        sqxtun2         v26.16b, v23.8h                         // 16-31
> +        ext             v1.16b, v18.16b, v19.16b, #1
> +        ext             v2.16b, v18.16b, v19.16b, #2
> +        ext             v3.16b, v18.16b, v19.16b, #3
> +        ext             v4.16b, v18.16b, v19.16b, #4
> +        ext             v5.16b, v18.16b, v19.16b, #5
> +        ext             v6.16b, v18.16b, v19.16b, #6
> +        ext             v7.16b, v18.16b, v19.16b, #7
> +        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
> +        sqxtn           v20.4h, v20.4s
> +        sqxtn2          v20.8h, v22.4s
> +        sqxtn           v21.4h, v21.4s
> +        sqxtn2          v21.8h, v23.4s
> +        trn1            v22.8h, v20.8h, v21.8h
> +        trn2            v23.8h, v20.8h, v21.8h
> +        sqxtun          v27.8b, v22.8h
> +        sqxtun2         v27.16b, v23.8h                         // 32-47
> +        st1             {v25.16b, v26.16b, v27.16b}, [x0], x1
> +        subs            w4, w4, #1
> +        b.hi            1b
> +        ret
> +endfunc
> +
> +
> +
> +// 64-wide variant: four 16-wide interleaved passes. The first load
> +// post-increments by 64, so srcstride is pre-decremented by 64 and the
> +// extra ld1 of v0 mid-loop fetches the 7-byte right context needed by
> +// the last pass before stepping to the next row.
> +function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_dotprod, export=1
> +        QPEL_UNI_W_H_HEADER
> +        sub             x3, x3, #64
> +1:
> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
> +        sqxtn           v20.4h, v20.4s
> +        sqxtn2          v20.8h, v22.4s
> +        sqxtn           v21.4h, v21.4s
> +        sqxtn2          v21.8h, v23.4s
> +        trn1            v22.8h, v20.8h, v21.8h
> +        trn2            v23.8h, v20.8h, v21.8h
> +        sqxtun          v16.8b, v22.8h
> +        sqxtun2         v16.16b, v23.8h                         // 0-15
> +        ext             v1.16b, v17.16b, v18.16b, #1
> +        ext             v2.16b, v17.16b, v18.16b, #2
> +        ext             v3.16b, v17.16b, v18.16b, #3
> +        ext             v4.16b, v17.16b, v18.16b, #4
> +        ext             v5.16b, v17.16b, v18.16b, #5
> +        ext             v6.16b, v17.16b, v18.16b, #6
> +        ext             v7.16b, v17.16b, v18.16b, #7
> +        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
> +        sqxtn           v20.4h, v20.4s
> +        sqxtn2          v20.8h, v22.4s
> +        sqxtn           v21.4h, v21.4s
> +        sqxtn2          v21.8h, v23.4s
> +        trn1            v22.8h, v20.8h, v21.8h
> +        trn2            v23.8h, v20.8h, v21.8h
> +        sqxtun          v17.8b, v22.8h
> +        sqxtun2         v17.16b, v23.8h                         // 16-31
> +        ext             v1.16b, v18.16b, v19.16b, #1
> +        ext             v2.16b, v18.16b, v19.16b, #2
> +        ext             v3.16b, v18.16b, v19.16b, #3
> +        ext             v4.16b, v18.16b, v19.16b, #4
> +        ext             v5.16b, v18.16b, v19.16b, #5
> +        ext             v6.16b, v18.16b, v19.16b, #6
> +        ext             v7.16b, v18.16b, v19.16b, #7
> +        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
> +        ld1             {v0.16b}, [x2], x3      // right context for pass 4
> +        sqxtn           v20.4h, v20.4s
> +        sqxtn2          v20.8h, v22.4s
> +        sqxtn           v21.4h, v21.4s
> +        sqxtn2          v21.8h, v23.4s
> +        trn1            v22.8h, v20.8h, v21.8h
> +        trn2            v23.8h, v20.8h, v21.8h
> +        sqxtun          v18.8b, v22.8h
> +        sqxtun2         v18.16b, v23.8h                         // 32-47
> +        ext             v1.16b, v19.16b, v0.16b, #1
> +        ext             v2.16b, v19.16b, v0.16b, #2
> +        ext             v3.16b, v19.16b, v0.16b, #3
> +        ext             v4.16b, v19.16b, v0.16b, #4
> +        ext             v5.16b, v19.16b, v0.16b, #5
> +        ext             v6.16b, v19.16b, v0.16b, #6
> +        ext             v7.16b, v19.16b, v0.16b, #7
> +        QPEL_UNI_W_H_CALC  v19, v2, v1, v3, v20, v24, v21, v0
> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
> +        sqxtn           v20.4h, v20.4s
> +        sqxtn2          v20.8h, v22.4s
> +        sqxtn           v21.4h, v21.4s
> +        sqxtn2          v21.8h, v23.4s
> +        trn1            v22.8h, v20.8h, v21.8h
> +        trn2            v23.8h, v20.8h, v21.8h
> +        sqxtun          v19.8b, v22.8h
> +        sqxtun2         v19.16b, v23.8h                         // 48-63
> +
> +        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
> +        subs            w4, w4, #1
> +        b.hi            1b
> +        ret
> +endfunc
> +
> +
> +// Shared prologue for the plain qpel_h* functions (16-bit intermediate
> +// output, no weighting): load the 8 signed taps (index x4 = mx)
> +// replicated into v31 and rewind src to the start of the 8-tap window.
> +.macro QPEL_H_HEADER
> +        movrel          x9, qpel_filters
> +        add             x9, x9, x4, lsl 3       // 8 filter bytes per index
> +        ldr             x11, [x9]
> +        dup             v31.2d, x11             // taps replicated per 64-bit lane
> +        sub             x1, x1, #3
> +.endm
> +
> +// 4-wide horizontal qpel, 16-bit output to the intermediate buffer
> +// (row stride MAX_PB_SIZE * 2 bytes). x0=dst, x1=src, x2=srcstride,
> +// w3=height, x4=mx.
> +function ff_hevc_put_hevc_qpel_h4_8_neon_dotprod, export=1
> +        QPEL_H_HEADER
> +        mov             x10, #MAX_PB_SIZE * 2
> +1:
> +        ld1             {v0.16b}, [x1], x2
> +        ext             v1.16b, v0.16b, v0.16b, #1
> +        ext             v2.16b, v0.16b, v0.16b, #2
> +        ext             v3.16b, v0.16b, v0.16b, #3
> +        zip1            v0.2d, v0.2d, v1.2d     // windows 0,1
> +        zip1            v2.2d, v2.2d, v3.2d     // windows 2,3
> +        movi            v16.2d, #0
> +        movi            v17.2d, #0
> +        usdot           v16.4s, v0.16b, v31.16b
> +        usdot           v17.4s, v2.16b, v31.16b
> +        addp            v16.4s, v16.4s, v17.4s  // 4 full filter sums
> +        sqxtn           v16.4h, v16.4s          // saturating narrow to s16
> +        str             d16, [x0]
> +        add             x0, x0, x10
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +// 6-wide variant of qpel_h4: 4 results to x0, 2 to x15 = x0 + 8 bytes.
> +function ff_hevc_put_hevc_qpel_h6_8_neon_dotprod, export=1
> +        QPEL_H_HEADER
> +        mov             x10, #MAX_PB_SIZE * 2
> +        add             x15, x0, #8
> +1:
> +        ld1             {v0.16b}, [x1], x2
> +        ext             v1.16b, v0.16b, v0.16b, #1
> +        ext             v2.16b, v0.16b, v0.16b, #2
> +        ext             v3.16b, v0.16b, v0.16b, #3
> +        ext             v4.16b, v0.16b, v0.16b, #4
> +        ext             v5.16b, v0.16b, v0.16b, #5
> +        zip1            v0.2d, v0.2d, v1.2d     // windows 0,1
> +        zip1            v2.2d, v2.2d, v3.2d     // windows 2,3
> +        zip1            v4.2d, v4.2d, v5.2d     // windows 4,5
> +        movi            v16.2d, #0
> +        movi            v17.2d, #0
> +        movi            v18.2d, #0
> +        usdot           v16.4s, v0.16b, v31.16b
> +        usdot           v17.4s, v2.16b, v31.16b
> +        usdot           v18.4s, v4.16b, v31.16b
> +        addp            v16.4s, v16.4s, v17.4s  // pixels 0-3
> +        addp            v18.4s, v18.4s, v18.4s  // pixels 4-5
> +        sqxtn           v16.4h, v16.4s
> +        sqxtn           v18.4h, v18.4s
> +        str             d16, [x0]
> +        str             s18, [x15]
> +        add             x0, x0, x10
> +        add             x15, x15, x10
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +// 8-wide variant: four zipped window pairs, four usdot accumulators,
> +// two addp folds, one 8x16-bit store per row.
> +function ff_hevc_put_hevc_qpel_h8_8_neon_dotprod, export=1
> +        QPEL_H_HEADER
> +        mov             x10, #MAX_PB_SIZE * 2
> +1:
> +        ld1             {v0.16b}, [x1], x2
> +        ext             v1.16b, v0.16b, v0.16b, #1
> +        ext             v2.16b, v0.16b, v0.16b, #2
> +        ext             v3.16b, v0.16b, v0.16b, #3
> +        ext             v4.16b, v0.16b, v0.16b, #4
> +        ext             v5.16b, v0.16b, v0.16b, #5
> +        ext             v6.16b, v0.16b, v0.16b, #6
> +        ext             v7.16b, v0.16b, v0.16b, #7
> +        zip1            v0.2d, v0.2d, v1.2d     // windows 0,1
> +        zip1            v2.2d, v2.2d, v3.2d     // windows 2,3
> +        zip1            v4.2d, v4.2d, v5.2d     // windows 4,5
> +        zip1            v6.2d, v6.2d, v7.2d     // windows 6,7
> +        movi            v16.2d, #0
> +        movi            v17.2d, #0
> +        movi            v18.2d, #0
> +        movi            v19.2d, #0
> +        usdot           v16.4s, v0.16b, v31.16b
> +        usdot           v17.4s, v2.16b, v31.16b
> +        usdot           v18.4s, v4.16b, v31.16b
> +        usdot           v19.4s, v6.16b, v31.16b
> +        addp            v16.4s, v16.4s, v17.4s  // pixels 0-3
> +        addp            v18.4s, v18.4s, v19.4s  // pixels 4-7
> +        sqxtn           v16.4h, v16.4s
> +        sqxtn2          v16.8h, v18.4s
> +        str             q16, [x0]
> +        add             x0, x0, x10
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +// Zero \d0..\d3 and accumulate the 4-way usdot of \s0..\s3 against the
> +// taps in v31; callers fold the per-lane partial sums with addp.
> +// NOTE(review): the mail client had line-wrapped the .macro header into
> +// the first movi, making the patch unassemblable; restored here.
> +.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
> +        movi            \d0\().2d, #0
> +        movi            \d1\().2d, #0
> +        movi            \d2\().2d, #0
> +        movi            \d3\().2d, #0
> +        usdot           \d0\().4s, \s0\().16b, v31.16b
> +        usdot           \d1\().4s, \s1\().16b, v31.16b
> +        usdot           \d2\().4s, \s2\().16b, v31.16b
> +        usdot           \d3\().4s, \s3\().16b, v31.16b
> +.endm
> +
> +// 12-wide variant: unshifted windows 0-3/4-7 through QPEL_H_CALC with a
> +// trn1/trn2 reorder, windows 8-11 via two zipped pairs; 16 bytes to x0
> +// and 8 bytes to x15 = x0 + 16.
> +function ff_hevc_put_hevc_qpel_h12_8_neon_dotprod, export=1
> +        QPEL_H_HEADER
> +        mov             x10, #MAX_PB_SIZE * 2
> +        add             x15, x0, #16
> +1:
> +        ld1             {v16.16b, v17.16b}, [x1], x2
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        zip1            v18.2d, v4.2d, v5.2d    // windows 8,9 equivalent pair
> +        zip1            v19.2d, v6.2d, v7.2d    // windows 10,11 equivalent pair
> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        movi            v24.2d, #0
> +        movi            v25.2d, #0
> +        usdot           v24.4s, v18.16b, v31.16b
> +        usdot           v25.4s, v19.16b, v31.16b
> +        addp            v24.4s, v24.4s, v25.4s  // pixels 8-11
> +        trn1            v26.4s, v20.4s, v21.4s  // restore pixel order 0-7
> +        trn2            v27.4s, v20.4s, v21.4s
> +        sqxtn           v26.4h, v26.4s
> +        sqxtn           v27.4h, v27.4s
> +        sqxtn2          v26.8h, v24.4s
> +
> +        str             q26, [x0]
> +        str             d27, [x15]
> +        add             x0, x0, x10
> +        add             x15, x15, x10
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +// 16-wide variant: two QPEL_H_CALC passes over the unshifted/shifted
> +// windows, addp folds, then trn1/trn2 to restore sequential pixel order
> +// before the two 16-bit stores.
> +// NOTE(review): the original email body had a mail-wrap artifact
> +// ("+        +        stp ...") before the store; restored to a blank
> +// separator line plus the stp so the patch applies.
> +function ff_hevc_put_hevc_qpel_h16_8_neon_dotprod, export=1
> +        QPEL_H_HEADER
> +        mov             x10, #MAX_PB_SIZE * 2
> +1:
> +        ld1             {v16.16b, v17.16b}, [x1], x2
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +
> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
> +
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        addp            v24.4s, v24.4s, v26.4s
> +        addp            v25.4s, v25.4s, v27.4s
> +
> +        trn1            v22.4s, v20.4s, v21.4s  // even/odd reorder
> +        trn2            v23.4s, v20.4s, v21.4s
> +        trn1            v26.4s, v24.4s, v25.4s
> +        trn2            v27.4s, v24.4s, v25.4s
> +
> +        sqxtn           v18.4h, v22.4s
> +        sqxtn2          v18.8h, v26.4s
> +        sqxtn           v19.4h, v23.4s
> +        sqxtn2          v19.8h, v27.4s
> +
> +        stp             q18, q19, [x0]
> +        add             x0, x0, x10
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +// 24-wide variant: pixels 0-15 via the 16-wide trn scheme, pixels 16-23
> +// via four zipped window pairs stored at x15 = x0 + 32 bytes.
> +// FIX(review): the second-half QPEL_H_CALC passed stale v5 as its 4th
> +// source; the freshly zipped v6 (windows 22,23) was dead. Changed
> +// v5 -> v6 to match the qpel_h8 pattern, otherwise pixels 22-23 are
> +// computed from the wrong windows.
> +function ff_hevc_put_hevc_qpel_h24_8_neon_dotprod, export=1
> +        QPEL_H_HEADER
> +        mov             x10, #MAX_PB_SIZE * 2
> +        add             x15, x0, #32
> +1:
> +        ld1             {v16.16b, v17.16b}, [x1], x2
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        addp            v24.4s, v24.4s, v26.4s
> +        addp            v25.4s, v25.4s, v27.4s
> +        trn1            v22.4s, v20.4s, v21.4s
> +        trn2            v23.4s, v20.4s, v21.4s
> +        trn1            v26.4s, v24.4s, v25.4s
> +        trn2            v27.4s, v24.4s, v25.4s
> +        sqxtn           v18.4h, v22.4s
> +        sqxtn2          v18.8h, v26.4s
> +        sqxtn           v19.4h, v23.4s
> +        sqxtn2          v19.8h, v27.4s
> +        stp             q18, q19, [x0]          // pixels 0-15
> +        add             x0, x0, x10
> +        ext             v1.16b, v17.16b, v17.16b, #1
> +        ext             v2.16b, v17.16b, v17.16b, #2
> +        ext             v3.16b, v17.16b, v17.16b, #3
> +        ext             v4.16b, v17.16b, v17.16b, #4
> +        ext             v5.16b, v17.16b, v17.16b, #5
> +        ext             v6.16b, v17.16b, v17.16b, #6
> +        ext             v7.16b, v17.16b, v17.16b, #7
> +        zip1            v0.2d, v17.2d, v1.2d    // windows 16,17
> +        zip1            v2.2d, v2.2d, v3.2d     // windows 18,19
> +        zip1            v4.2d, v4.2d, v5.2d     // windows 20,21
> +        zip1            v6.2d, v6.2d, v7.2d     // windows 22,23
> +        QPEL_H_CALC     v0, v2, v4, v6, v20, v21, v22, v23
> +        addp            v20.4s, v20.4s, v21.4s
> +        addp            v22.4s, v22.4s, v23.4s
> +        sqxtn           v20.4h, v20.4s
> +        sqxtn2          v20.8h, v22.4s
> +        str             q20, [x15]              // pixels 16-23
> +        add             x15, x15, x10
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +// 32-wide variant: two back-to-back 16-wide passes (v16/v17 then
> +// v17/v18); second half stored through x15 = x0 + 32 bytes.
> +function ff_hevc_put_hevc_qpel_h32_8_neon_dotprod, export=1
> +        QPEL_H_HEADER
> +        mov             x10, #MAX_PB_SIZE * 2
> +        add             x15, x0, #32
> +1:
> +        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        addp            v24.4s, v24.4s, v26.4s
> +        addp            v25.4s, v25.4s, v27.4s
> +        trn1            v22.4s, v20.4s, v21.4s
> +        trn2            v23.4s, v20.4s, v21.4s
> +        trn1            v26.4s, v24.4s, v25.4s
> +        trn2            v27.4s, v24.4s, v25.4s
> +        sqxtn           v20.4h, v22.4s
> +        sqxtn2          v20.8h, v26.4s
> +        sqxtn           v21.4h, v23.4s
> +        sqxtn2          v21.8h, v27.4s
> +        stp             q20, q21, [x0]          // pixels 0-15
> +        add             x0, x0, x10
> +        ext             v1.16b, v17.16b, v18.16b, #1
> +        ext             v2.16b, v17.16b, v18.16b, #2
> +        ext             v3.16b, v17.16b, v18.16b, #3
> +        ext             v4.16b, v17.16b, v18.16b, #4
> +        ext             v5.16b, v17.16b, v18.16b, #5
> +        ext             v6.16b, v17.16b, v18.16b, #6
> +        ext             v7.16b, v17.16b, v18.16b, #7
> +        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        addp            v24.4s, v24.4s, v26.4s
> +        addp            v25.4s, v25.4s, v27.4s
> +        trn1            v22.4s, v20.4s, v21.4s
> +        trn2            v23.4s, v20.4s, v21.4s
> +        trn1            v26.4s, v24.4s, v25.4s
> +        trn2            v27.4s, v24.4s, v25.4s
> +        sqxtn           v20.4h, v22.4s
> +        sqxtn2          v20.8h, v26.4s
> +        sqxtn           v21.4h, v23.4s
> +        sqxtn2          v21.8h, v27.4s
> +        stp             q20, q21, [x15]         // pixels 16-31
> +        add             x15, x15, x10
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +// HEVC 8-bit qpel horizontal filter, width 48 (dotprod path).
> +// Register roles (from usage below): x0 = int16 dst rows of
> +// MAX_PB_SIZE*2 bytes, x1 = src, x2 = src stride, w3 = height.
> +// Each row is filtered as three 16-pixel groups; the 64 loaded source
> +// bytes cover 48 outputs plus 7 taps of right context.
> +// QPEL_H_HEADER / QPEL_H_CALC are defined earlier in this file (outside
> +// this hunk) — presumably the udot-based 8-tap MAC; confirm there.
> +function ff_hevc_put_hevc_qpel_h48_8_neon_dotprod, export=1
> +        QPEL_H_HEADER
> +        // dst row advance remaining after the two post-incremented
> +        // 32-byte stores below.
> +        mov             x10, #MAX_PB_SIZE * 2 - 64
> +1:
> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
> +        // ---- pixels 0..15: 8 shifted views of the source window ----
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
> +        // Pairwise-add + transpose to restore pixel order, then
> +        // saturating-narrow the 32-bit sums to int16.
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        addp            v24.4s, v24.4s, v26.4s
> +        addp            v25.4s, v25.4s, v27.4s
> +        trn1            v22.4s, v20.4s, v21.4s
> +        trn2            v23.4s, v20.4s, v21.4s
> +        trn1            v26.4s, v24.4s, v25.4s
> +        trn2            v27.4s, v24.4s, v25.4s
> +        sqxtn           v20.4h, v22.4s
> +        sqxtn2          v20.8h, v26.4s
> +        sqxtn           v21.4h, v23.4s
> +        sqxtn2          v21.8h, v27.4s
> +        stp             q20, q21, [x0], #32
> +
> +        // ---- pixels 16..31 ----
> +        ext             v1.16b, v17.16b, v18.16b, #1
> +        ext             v2.16b, v17.16b, v18.16b, #2
> +        ext             v3.16b, v17.16b, v18.16b, #3
> +        ext             v4.16b, v17.16b, v18.16b, #4
> +        ext             v5.16b, v17.16b, v18.16b, #5
> +        ext             v6.16b, v17.16b, v18.16b, #6
> +        ext             v7.16b, v17.16b, v18.16b, #7
> +        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        addp            v24.4s, v24.4s, v26.4s
> +        addp            v25.4s, v25.4s, v27.4s
> +        trn1            v22.4s, v20.4s, v21.4s
> +        trn2            v23.4s, v20.4s, v21.4s
> +        trn1            v26.4s, v24.4s, v25.4s
> +        trn2            v27.4s, v24.4s, v25.4s
> +        sqxtn           v20.4h, v22.4s
> +        sqxtn2          v20.8h, v26.4s
> +        sqxtn           v21.4h, v23.4s
> +        sqxtn2          v21.8h, v27.4s
> +        stp             q20, q21, [x0], #32
> +        // ---- pixels 32..47 ----
> +        ext             v1.16b, v18.16b, v19.16b, #1
> +        ext             v2.16b, v18.16b, v19.16b, #2
> +        ext             v3.16b, v18.16b, v19.16b, #3
> +        ext             v4.16b, v18.16b, v19.16b, #4
> +        ext             v5.16b, v18.16b, v19.16b, #5
> +        ext             v6.16b, v18.16b, v19.16b, #6
> +        ext             v7.16b, v18.16b, v19.16b, #7
> +        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        addp            v24.4s, v24.4s, v26.4s
> +        addp            v25.4s, v25.4s, v27.4s
> +        trn1            v22.4s, v20.4s, v21.4s
> +        trn2            v23.4s, v20.4s, v21.4s
> +        trn1            v26.4s, v24.4s, v25.4s
> +        trn2            v27.4s, v24.4s, v25.4s
> +        sqxtn           v20.4h, v22.4s
> +        sqxtn2          v20.8h, v26.4s
> +        sqxtn           v21.4h, v23.4s
> +        sqxtn2          v21.8h, v27.4s
> +        stp             q20, q21, [x0]
> +        add             x0, x0, x10
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +// HEVC 8-bit qpel horizontal filter, width 64 (dotprod path).
> +// Same scheme as the width-48 variant, processed as four 16-pixel
> +// groups.  The first load post-increments x1 by 64, so the stride is
> +// pre-decremented and re-applied by the final 8-byte context load.
> +function ff_hevc_put_hevc_qpel_h64_8_neon_dotprod, export=1
> +        QPEL_H_HEADER
> +        // Compensate for the "[x1], #64" post-increment below.
> +        sub             x2, x2, #64
> +1:
> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
> +        // ---- pixels 0..15 ----
> +        ext             v1.16b, v16.16b, v17.16b, #1
> +        ext             v2.16b, v16.16b, v17.16b, #2
> +        ext             v3.16b, v16.16b, v17.16b, #3
> +        ext             v4.16b, v16.16b, v17.16b, #4
> +        ext             v5.16b, v16.16b, v17.16b, #5
> +        ext             v6.16b, v16.16b, v17.16b, #6
> +        ext             v7.16b, v16.16b, v17.16b, #7
> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        addp            v24.4s, v24.4s, v26.4s
> +        addp            v25.4s, v25.4s, v27.4s
> +        trn1            v22.4s, v20.4s, v21.4s
> +        trn2            v23.4s, v20.4s, v21.4s
> +        trn1            v26.4s, v24.4s, v25.4s
> +        trn2            v27.4s, v24.4s, v25.4s
> +        sqxtn           v20.4h, v22.4s
> +        sqxtn2          v20.8h, v26.4s
> +        sqxtn           v21.4h, v23.4s
> +        sqxtn2          v21.8h, v27.4s
> +        stp             q20, q21, [x0], #32
> +
> +        // ---- pixels 16..31 ----
> +        ext             v1.16b, v17.16b, v18.16b, #1
> +        ext             v2.16b, v17.16b, v18.16b, #2
> +        ext             v3.16b, v17.16b, v18.16b, #3
> +        ext             v4.16b, v17.16b, v18.16b, #4
> +        ext             v5.16b, v17.16b, v18.16b, #5
> +        ext             v6.16b, v17.16b, v18.16b, #6
> +        ext             v7.16b, v17.16b, v18.16b, #7
> +        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        addp            v24.4s, v24.4s, v26.4s
> +        addp            v25.4s, v25.4s, v27.4s
> +        trn1            v22.4s, v20.4s, v21.4s
> +        trn2            v23.4s, v20.4s, v21.4s
> +        trn1            v26.4s, v24.4s, v25.4s
> +        trn2            v27.4s, v24.4s, v25.4s
> +        sqxtn           v20.4h, v22.4s
> +        sqxtn2          v20.8h, v26.4s
> +        sqxtn           v21.4h, v23.4s
> +        sqxtn2          v21.8h, v27.4s
> +        stp             q20, q21, [x0], #32
> +        // ---- pixels 32..47 ----
> +        ext             v1.16b, v18.16b, v19.16b, #1
> +        ext             v2.16b, v18.16b, v19.16b, #2
> +        ext             v3.16b, v18.16b, v19.16b, #3
> +        ext             v4.16b, v18.16b, v19.16b, #4
> +        ext             v5.16b, v18.16b, v19.16b, #5
> +        ext             v6.16b, v18.16b, v19.16b, #6
> +        ext             v7.16b, v18.16b, v19.16b, #7
> +        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        addp            v24.4s, v24.4s, v26.4s
> +        addp            v25.4s, v25.4s, v27.4s
> +        trn1            v22.4s, v20.4s, v21.4s
> +        trn2            v23.4s, v20.4s, v21.4s
> +        trn1            v26.4s, v24.4s, v25.4s
> +        trn2            v27.4s, v24.4s, v25.4s
> +        sqxtn           v20.4h, v22.4s
> +        sqxtn2          v20.8h, v26.4s
> +        sqxtn           v21.4h, v23.4s
> +        sqxtn2          v21.8h, v27.4s
> +        stp             q20, q21, [x0], #32
> +        // 8 bytes of right context for the last group (ext uses at most
> +        // 7 of them); x2 = srcstride - 64, so this advances to the next
> +        // source row.
> +        ld1             {v28.8b}, [x1], x2
> +        // ---- pixels 48..63 ----
> +        ext             v1.16b, v19.16b, v28.16b, #1
> +        ext             v2.16b, v19.16b, v28.16b, #2
> +        ext             v3.16b, v19.16b, v28.16b, #3
> +        ext             v4.16b, v19.16b, v28.16b, #4
> +        ext             v5.16b, v19.16b, v28.16b, #5
> +        ext             v6.16b, v19.16b, v28.16b, #6
> +        ext             v7.16b, v19.16b, v28.16b, #7
> +        QPEL_H_CALC     v19, v1, v2, v3, v20, v21, v22, v23
> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
> +        addp            v20.4s, v20.4s, v22.4s
> +        addp            v21.4s, v21.4s, v23.4s
> +        addp            v24.4s, v24.4s, v26.4s
> +        addp            v25.4s, v25.4s, v27.4s
> +        trn1            v22.4s, v20.4s, v21.4s
> +        trn2            v23.4s, v20.4s, v21.4s
> +        trn1            v26.4s, v24.4s, v25.4s
> +        trn2            v27.4s, v24.4s, v25.4s
> +        sqxtn           v20.4h, v22.4s
> +        sqxtn2          v20.8h, v26.4s
> +        sqxtn           v21.4h, v23.4s
> +        sqxtn2          v21.8h, v27.4s
> +        stp             q20, q21, [x0], #32
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +// Shared prologue for the uni_w_hv (separable horizontal+vertical,
> +// weighted, unidirectional) functions.  Stack args: [sp] = mx,
> +// [sp+8] = my, [sp+16] = width.  Saves x20-x28 and x30 (x30 because of
> +// the bl below), then carves a 9088-byte scratch buffer off the stack:
> +// (MAX_PB_SIZE * 2) * (64 + 7) = 128 * 71, i.e. one int16 row of
> +// intermediates per output row plus the 7 extra rows the 8-tap vertical
> +// filter needs.  The horizontal pass is run first into that buffer; the
> +// caller then consumes it row by row.  On return from this macro:
> +//   v0  = my filter taps widened to s16, x10 = intermediate row stride,
> +//   v28 = wx, v29 = ox, v30 = -(w5 + 6) — a negative count for sqrshl,
> +//         i.e. a saturating rounding right shift.  w5 is presumably the
> +//         weighted-prediction denom (8-bit shift = denom + 14 - 8);
> +//         TODO confirm against the C reference.
> +.macro QPEL_UNI_W_HV_HEADER width
> +        ldp             x14, x15, [sp]          // mx, my
> +        ldr             w13, [sp, #16]          // width
> +        stp             x20, x21, [sp, #-16]!
> +        stp             x22, x23, [sp, #-16]!
> +        stp             x24, x25, [sp, #-16]!
> +        stp             x26, x27, [sp, #-16]!
> +        stp             x28, x30, [sp, #-16]!
> +        mov             x28, sp                 // restored by QPEL_UNI_W_HV_END
> +        mov             x11, #9088
> +        sub             sp, sp, x11
> +        mov             x20, x0                 // dst
> +        mov             x21, x1                 // dststride
> +        mov             x0, sp                  // horizontal pass writes into the scratch
> +        sub             x1, x2, x3, lsl 1       // src - 3 * srcstride:
> +        sub             x1, x1, x3              // start 3 rows above for the 8-tap filter
> +        mov             x2, x3
> +        add             w3, w4, #7              // height + 7 intermediate rows
> +        mov             w22, w4                 // height
> +        mov             x4, x14                 // mx
> +        mov             x23, x15                // my
> +        mov             w24, w6                 // wx
> +        mov             w25, w7                 // ox
> +        mov             w26, #-6
> +        sub             w26, w26, w5            // -shift
> +        mov             w27, w13                // width
> +        bl              X(ff_hevc_put_hevc_qpel_h\width\()_8_neon_dotprod)
> +        movrel          x9, qpel_filters
> +        add             x9, x9, x23, lsl 3      // 8 taps per filter index
> +        ld1             {v0.8b}, [x9]
> +        sxtl            v0.8h, v0.8b
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +        dup             v28.4s, w24
> +        dup             v29.4s, w25
> +        dup             v30.4s, w26
> +.endm
> +
> +// Epilogue matching QPEL_UNI_W_HV_HEADER: the callers advance sp through
> +// the scratch buffer while reading it, so sp is first restored from x28,
> +// then the callee-saved registers are popped in reverse push order.
> +.macro QPEL_UNI_W_HV_END
> +        mov             sp, x28
> +        ldp             x28, x30, [sp], #16
> +        ldp             x26, x27, [sp], #16
> +        ldp             x24, x25, [sp], #16
> +        ldp             x22, x23, [sp], #16
> +        ldp             x20, x21, [sp], #16
> +.endm
> +
> +// Weighted-prediction store of 4 pixels.  v26 holds the 32-bit vertical
> +// filter accumulators: drop the intermediate precision (>> 6), scale by
> +// wx, rounding-right-shift by 'shift' (sqrshl with the negative count in
> +// v30), add offset ox with saturation, then narrow to unsigned 8-bit and
> +// store 4 bytes, advancing dst by its stride.
> +.macro QPEL_UNI_W_HV_4
> +        sshr            v26.4s, v26.4s, #6
> +        mul             v24.4s, v26.4s, v28.4s          // * wx
> +        sqrshl          v24.4s, v24.4s, v30.4s          // >> shift (rounding, saturating)
> +        sqadd           v24.4s, v24.4s, v29.4s          // + ox
> +        sqxtn           v24.4h, v24.4s
> +        sqxtun          v24.8b, v24.8h
> +        st1             {v24.s}[0], [x20], x21
> +.endm
> +
> +// 8-tap MAC for the vertical pass over the int16 intermediates: multiply-
> +// accumulate the low 4 halfwords of eight successive rows with the taps
> +// in v0, producing four 32-bit sums in \dst.
> +.macro QPEL_FILTER_H    dst, src0, src1, src2, src3, src4, src5, src6, src7
> +        smull           \dst\().4s, \src0\().4h, v0.h[0]
> +        smlal           \dst\().4s, \src1\().4h, v0.h[1]
> +        smlal           \dst\().4s, \src2\().4h, v0.h[2]
> +        smlal           \dst\().4s, \src3\().4h, v0.h[3]
> +        smlal           \dst\().4s, \src4\().4h, v0.h[4]
> +        smlal           \dst\().4s, \src5\().4h, v0.h[5]
> +        smlal           \dst\().4s, \src6\().4h, v0.h[6]
> +        smlal           \dst\().4s, \src7\().4h, v0.h[7]
> +.endm
> +
> +// Same 8-tap MAC as QPEL_FILTER_H but over the high halves (halfword
> +// lanes 4..7) of the eight source rows, via smull2/smlal2.
> +// NOTE: the .macro line was wrapped by the mail client in the posted
> +// patch ("src7" pushed onto its own line), which breaks assembly and is
> +// why patchwork reported "Failed to apply patch"; it must stay on one
> +// line as below.
> +.macro QPEL_FILTER_H2    dst, src0, src1, src2, src3, src4, src5, src6, src7
> +        smull2          \dst\().4s, \src0\().8h, v0.h[0]
> +        smlal2          \dst\().4s, \src1\().8h, v0.h[1]
> +        smlal2          \dst\().4s, \src2\().8h, v0.h[2]
> +        smlal2          \dst\().4s, \src3\().8h, v0.h[3]
> +        smlal2          \dst\().4s, \src4\().8h, v0.h[4]
> +        smlal2          \dst\().4s, \src5\().8h, v0.h[5]
> +        smlal2          \dst\().4s, \src6\().8h, v0.h[6]
> +        smlal2          \dst\().4s, \src7\().8h, v0.h[7]
> +.endm
> +
> +// uni_w_hv, width 4.  The horizontal pass has already filled the scratch
> +// buffer (see QPEL_UNI_W_HV_HEADER); here sp walks that buffer row by
> +// row (restored from x28 in QPEL_UNI_W_HV_END).  v16-v23 hold a rolling
> +// window of 8 intermediate rows (4 int16 columns each); the loop is
> +// unrolled 8x so each iteration reloads only the oldest register.
> +// w22 = remaining output rows, x10 = intermediate row stride.
> +function ff_hevc_put_hevc_qpel_uni_w_hv4_8_neon_dotprod, export=1
> +        QPEL_UNI_W_HV_HEADER 4
> +        // Prime the window with the first 7 rows.
> +        ldr             d16, [sp]
> +        ldr             d17, [sp, x10]
> +        add             sp, sp, x10, lsl 1
> +        ldr             d18, [sp]
> +        ldr             d19, [sp, x10]
> +        add             sp, sp, x10, lsl 1
> +        ldr             d20, [sp]
> +        ldr             d21, [sp, x10]
> +        add             sp, sp, x10, lsl 1
> +        ldr             d22, [sp]
> +        add             sp, sp, x10
> +1:
> +        ldr             d23, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_UNI_W_HV_4
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             d16, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_UNI_W_HV_4
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             d17, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_UNI_W_HV_4
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             d18, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_UNI_W_HV_4
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             d19, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_UNI_W_HV_4
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             d20, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_UNI_W_HV_4
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             d21, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_UNI_W_HV_4
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             d22, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_UNI_W_HV_4
> +        subs            w22, w22, #1
> +        b.hi            1b
> +
> +2:
> +        QPEL_UNI_W_HV_END
> +        ret
> +endfunc
> +
> +// Weighted-prediction store of 8 pixels; same pipeline as
> +// QPEL_UNI_W_HV_4 applied to the two 4x32-bit accumulators v26/v27,
> +// narrowed together to 8 unsigned bytes.
> +.macro QPEL_UNI_W_HV_8
> +        sshr            v26.4s, v26.4s, #6
> +        sshr            v27.4s, v27.4s, #6
> +        mul             v24.4s, v26.4s, v28.4s          // * wx
> +        mul             v25.4s, v27.4s, v28.4s
> +        sqrshl          v24.4s, v24.4s, v30.4s          // >> shift (rounding, saturating)
> +        sqrshl          v25.4s, v25.4s, v30.4s
> +        sqadd           v24.4s, v24.4s, v29.4s          // + ox
> +        sqadd           v25.4s, v25.4s, v29.4s
> +        sqxtn           v24.4h, v24.4s
> +        sqxtn2          v24.8h, v25.4s
> +        sqxtun          v24.8b, v24.8h
> +        st1             {v24.d}[0], [x20], x21
> +.endm
> +
> +// uni_w_hv, width 8.  Same structure as the width-4 variant, but each
> +// intermediate row is a full q register (8 int16 columns), and both
> +// halves are filtered (QPEL_FILTER_H for lanes 0..3, QPEL_FILTER_H2 for
> +// lanes 4..7).  sp walks the scratch buffer; w22 counts output rows.
> +function ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_dotprod, export=1
> +        QPEL_UNI_W_HV_HEADER 8
> +        // Prime the 8-row window with the first 7 rows.
> +        ldr             q16, [sp]
> +        ldr             q17, [sp, x10]
> +        add             sp, sp, x10, lsl 1
> +        ldr             q18, [sp]
> +        ldr             q19, [sp, x10]
> +        add             sp, sp, x10, lsl 1
> +        ldr             q20, [sp]
> +        ldr             q21, [sp, x10]
> +        add             sp, sp, x10, lsl 1
> +        ldr             q22, [sp]
> +        add             sp, sp, x10
> +1:
> +        ldr             q23, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_FILTER_H2  v27, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_UNI_W_HV_8
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             q16, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_FILTER_H2  v27, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_UNI_W_HV_8
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             q17, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_FILTER_H2  v27, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_UNI_W_HV_8
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             q18, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_FILTER_H2  v27, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_UNI_W_HV_8
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             q19, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_FILTER_H2  v27, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_UNI_W_HV_8
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             q20, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_FILTER_H2  v27, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_UNI_W_HV_8
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             q21, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_FILTER_H2  v27, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_UNI_W_HV_8
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldr             q22, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_FILTER_H2  v27, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_UNI_W_HV_8
> +        subs            w22, w22, #1
> +        b.hi            1b
> +
> +2:
> +        QPEL_UNI_W_HV_END
> +        ret
> +endfunc
> +
> +// Weighted-prediction store of 16 pixels from the four 4x32-bit
> +// accumulators v24-v27: scale by wx, rounding-right-shift by 'shift'
> +// (negative count in v30), add ox, then saturating-narrow twice down to
> +// 16 unsigned bytes.
> +.macro QPEL_UNI_W_HV_16
> +        sshr            v24.4s, v24.4s, #6
> +        sshr            v25.4s, v25.4s, #6
> +        sshr            v26.4s, v26.4s, #6
> +        sshr            v27.4s, v27.4s, #6
> +        mul             v24.4s, v24.4s, v28.4s          // * wx
> +        mul             v25.4s, v25.4s, v28.4s
> +        mul             v26.4s, v26.4s, v28.4s
> +        mul             v27.4s, v27.4s, v28.4s
> +        sqrshl          v24.4s, v24.4s, v30.4s          // >> shift (rounding, saturating)
> +        sqrshl          v25.4s, v25.4s, v30.4s
> +        sqrshl          v26.4s, v26.4s, v30.4s
> +        sqrshl          v27.4s, v27.4s, v30.4s
> +        sqadd           v24.4s, v24.4s, v29.4s          // + ox
> +        sqadd           v25.4s, v25.4s, v29.4s
> +        sqadd           v26.4s, v26.4s, v29.4s
> +        sqadd           v27.4s, v27.4s, v29.4s
> +        sqxtn           v24.4h, v24.4s
> +        sqxtn2          v24.8h, v25.4s
> +        sqxtn           v26.4h, v26.4s
> +        sqxtn2          v26.8h, v27.4s
> +        sqxtun          v24.8b, v24.8h
> +        sqxtun2         v24.16b, v26.8h
> +
> +        st1             {v24.16b}, [x20], x21
> +.endm
> +
> +// uni_w_hv, width 16.  Each intermediate row is 32 bytes: ldp loads
> +// columns 0-7 into q16..q23 and columns 8-15 into q1..q7/q31.  Each
> +// iteration runs four 8-tap MACs (low/high column halves x low/high
> +// 16-bit lanes) and stores 16 output pixels.  sp walks the scratch
> +// buffer; w22 counts output rows; 8x unrolled with register rotation.
> +function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_dotprod, export=1
> +        QPEL_UNI_W_HV_HEADER 16
> +        // Prime the window with the first 7 rows.
> +        ldp             q16, q1, [sp]
> +        add             sp, sp, x10
> +        ldp             q17, q2, [sp]
> +        add             sp, sp, x10
> +        ldp             q18, q3, [sp]
> +        add             sp, sp, x10
> +        ldp             q19, q4, [sp]
> +        add             sp, sp, x10
> +        ldp             q20, q5, [sp]
> +        add             sp, sp, x10
> +        ldp             q21, q6, [sp]
> +        add             sp, sp, x10
> +        ldp             q22, q7, [sp]
> +        add             sp, sp, x10
> +1:
> +        ldp             q23, q31, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
> +        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q16, q1, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
> +        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q17, q2, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
> +        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q18, q3, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
> +        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q19, q4, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
> +        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q20, q5, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
> +        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q21, q6, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
> +        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q22, q7, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
> +        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.hi            1b
> +
> +2:
> +        QPEL_UNI_W_HV_END
> +        ret
> +endfunc
> +
> +
> +// uni_w_hv, width 32: the width-16 kernel wrapped in a strip loop.
> +// x11/x13 remember the current strip's base in the scratch buffer and in
> +// dst, w12 backs up the height.  After one 16-column strip finishes
> +// (label 2), advance 32 bytes (16 int16 columns) in the scratch and 16
> +// bytes in dst, and repeat while w27 (width) has strips left.
> +function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_dotprod, export=1
> +        QPEL_UNI_W_HV_HEADER 32
> +        mov             x11, sp                 // strip base in the scratch
> +        mov             w12, w22                // height backup
> +        mov             x13, x20                // strip base in dst
> +3:
> +        // Prime the 8-row window with the first 7 rows of this strip.
> +        ldp             q16, q1, [sp]
> +        add             sp, sp, x10
> +        ldp             q17, q2, [sp]
> +        add             sp, sp, x10
> +        ldp             q18, q3, [sp]
> +        add             sp, sp, x10
> +        ldp             q19, q4, [sp]
> +        add             sp, sp, x10
> +        ldp             q20, q5, [sp]
> +        add             sp, sp, x10
> +        ldp             q21, q6, [sp]
> +        add             sp, sp, x10
> +        ldp             q22, q7, [sp]
> +        add             sp, sp, x10
> +1:
> +        ldp             q23, q31, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
> +        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q16, q1, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
> +        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q17, q2, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
> +        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q18, q3, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
> +        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q19, q4, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
> +        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q20, q5, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
> +        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q21, q6, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
> +        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q22, q7, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
> +        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.hi            1b
> +2:
> +        // Next 16-column strip.
> +        subs            w27, w27, #16
> +        add             sp, x11, #32
> +        add             x20, x13, #16
> +        mov             w22, w12
> +        mov             x11, sp
> +        mov             x13, x20
> +        b.hi            3b
> +        QPEL_UNI_W_HV_END
> +        ret
> +endfunc
> +
> +// uni_w_hv, width 64: identical strip loop to the width-32 variant —
> +// only the horizontal-pass width passed to the header differs — so it
> +// iterates over four 16-column strips (w27 = 64).
> +function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_dotprod, export=1
> +        QPEL_UNI_W_HV_HEADER 64
> +        mov             x11, sp                 // strip base in the scratch
> +        mov             w12, w22                // height backup
> +        mov             x13, x20                // strip base in dst
> +3:
> +        // Prime the 8-row window with the first 7 rows of this strip.
> +        ldp             q16, q1, [sp]
> +        add             sp, sp, x10
> +        ldp             q17, q2, [sp]
> +        add             sp, sp, x10
> +        ldp             q18, q3, [sp]
> +        add             sp, sp, x10
> +        ldp             q19, q4, [sp]
> +        add             sp, sp, x10
> +        ldp             q20, q5, [sp]
> +        add             sp, sp, x10
> +        ldp             q21, q6, [sp]
> +        add             sp, sp, x10
> +        ldp             q22, q7, [sp]
> +        add             sp, sp, x10
> +1:
> +        ldp             q23, q31, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
> +        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q16, q1, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
> +        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
> +        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q17, q2, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
> +        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
> +        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q18, q3, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
> +        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
> +        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q19, q4, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
> +        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
> +        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q20, q5, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
> +        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
> +        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q21, q6, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
> +        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
> +        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.eq            2f
> +
> +        ldp             q22, q7, [sp]
> +        add             sp, sp, x10
> +        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
> +        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
> +        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
> +        QPEL_UNI_W_HV_16
> +        subs            w22, w22, #1
> +        b.hi            1b
> +2:
> +        // Next 16-column strip.
> +        subs            w27, w27, #16
> +        add             sp, x11, #32
> +        add             x20, x13, #16
> +        mov             w22, w12
> +        mov             x11, sp
> +        mov             x13, x20
> +        b.hi            3b
> +        QPEL_UNI_W_HV_END
> +        ret
> +endfunc
> +
> +#endif // __ARM_FEATURE_DOTPROD
> \ No newline at end of file
> -- 
> 2.38.0.windows.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Logan.Lyu May 3, 2023, 2:14 a.m. UTC | #2
Hello,

- I split this patch. Do I need to resubmit it, or just attach the patches as 
attachments? (I attached those patches.  If I need to resubmit, please 
let me know.)

- Those functions are checked by checkasm already.

Thanks.


在 2023/5/2 20:32, Jean-Baptiste Kempf 写道:
> Hello,
>
> Just 2 questions:
> - could you split this patch into several (3,4 or 5)
> - are all those functions checked by checkasm?
>
> Thanks,
>
> jb
>
> On Sun, 30 Apr 2023, at 10:57, myais wrote:
>> Hi,
>> This is a patch for the aarch64, which completes the neon versions of
>> the hevc_pel_uni_w_pixels, qpel_uni_w_h, qpel_uni_w_v, qpel_uni_w_hv
>> interfaces.
>>
>> put_hevc_pel_uni_w_pixels4_8_c: 54.3
>> put_hevc_pel_uni_w_pixels4_8_neon: 24.1
>> put_hevc_pel_uni_w_pixels6_8_c: 105.3
>> put_hevc_pel_uni_w_pixels6_8_neon: 53.1
>> put_hevc_pel_uni_w_pixels8_8_c: 176.6
>> put_hevc_pel_uni_w_pixels8_8_neon: 63.8
>> put_hevc_pel_uni_w_pixels12_8_c: 391.1
>> put_hevc_pel_uni_w_pixels12_8_neon: 193.3
>> put_hevc_pel_uni_w_pixels16_8_c: 688.1
>> put_hevc_pel_uni_w_pixels16_8_neon: 226.1
>> put_hevc_pel_uni_w_pixels24_8_c: 1542.3
>> put_hevc_pel_uni_w_pixels24_8_neon: 536.8
>> put_hevc_pel_uni_w_pixels32_8_c: 2753.1
>> put_hevc_pel_uni_w_pixels32_8_neon: 875.8
>> put_hevc_pel_uni_w_pixels48_8_c: 6251.1
>> put_hevc_pel_uni_w_pixels48_8_neon: 1966.1
>> put_hevc_pel_uni_w_pixels64_8_c: 11047.1
>> put_hevc_pel_uni_w_pixels64_8_neon: 3449.8
>>
>> put_hevc_qpel_uni_w_h4_8_c: 156.6
>> put_hevc_qpel_uni_w_h4_8_neon: 44.6
>> put_hevc_qpel_uni_w_h6_8_c: 324.6
>> put_hevc_qpel_uni_w_h6_8_neon: 103.1
>> put_hevc_qpel_uni_w_h8_8_c: 549.3
>> put_hevc_qpel_uni_w_h8_8_neon: 138.6
>> put_hevc_qpel_uni_w_h12_8_c: 1240.3
>> put_hevc_qpel_uni_w_h12_8_neon: 277.3
>> put_hevc_qpel_uni_w_h16_8_c: 2161.8
>> put_hevc_qpel_uni_w_h16_8_neon: 394.1
>> put_hevc_qpel_uni_w_h24_8_c: 4874.8
>> put_hevc_qpel_uni_w_h24_8_neon: 972.6
>> put_hevc_qpel_uni_w_h32_8_c: 8517.8
>> put_hevc_qpel_uni_w_h32_8_neon: 1517.3
>> put_hevc_qpel_uni_w_h48_8_c: 19856.1
>> put_hevc_qpel_uni_w_h48_8_neon: 3429.8
>> put_hevc_qpel_uni_w_h64_8_c: 35159.3
>> put_hevc_qpel_uni_w_h64_8_neon: 6018.1
>>
>> put_hevc_qpel_uni_w_v4_8_c: 180.6
>> put_hevc_qpel_uni_w_v4_8_neon: 63.8
>> put_hevc_qpel_uni_w_v6_8_c: 318.6
>> put_hevc_qpel_uni_w_v6_8_neon: 117.8
>> put_hevc_qpel_uni_w_v8_8_c: 547.6
>> put_hevc_qpel_uni_w_v8_8_neon: 132.1
>> put_hevc_qpel_uni_w_v12_8_c: 1202.8
>> put_hevc_qpel_uni_w_v12_8_neon: 350.1
>> put_hevc_qpel_uni_w_v16_8_c: 2109.6
>> put_hevc_qpel_uni_w_v16_8_neon: 442.1
>> put_hevc_qpel_uni_w_v24_8_c: 4748.8
>> put_hevc_qpel_uni_w_v24_8_neon: 1287.1
>> put_hevc_qpel_uni_w_v32_8_c: 8487.3
>> put_hevc_qpel_uni_w_v32_8_neon: 1704.3
>> put_hevc_qpel_uni_w_v48_8_c: 18798.8
>> put_hevc_qpel_uni_w_v48_8_neon: 3790.8
>> put_hevc_qpel_uni_w_v64_8_c: 35614.6
>> put_hevc_qpel_uni_w_v64_8_neon: 6725.6
>>
>>
>> put_hevc_qpel_uni_w_hv4_8_c: 498.8
>> put_hevc_qpel_uni_w_hv4_8_neon: 139.3
>> put_hevc_qpel_uni_w_hv6_8_c: 874.6
>> put_hevc_qpel_uni_w_hv6_8_neon: 295.3
>> put_hevc_qpel_uni_w_hv8_8_c: 1372.1
>> put_hevc_qpel_uni_w_hv8_8_neon: 387.1
>> put_hevc_qpel_uni_w_hv12_8_c: 2721.8
>> put_hevc_qpel_uni_w_hv12_8_neon: 804.8
>> put_hevc_qpel_uni_w_hv16_8_c: 4503.1
>> put_hevc_qpel_uni_w_hv16_8_neon: 1038.1
>> put_hevc_qpel_uni_w_hv24_8_c: 9321.8
>> put_hevc_qpel_uni_w_hv24_8_neon: 2962.1
>> put_hevc_qpel_uni_w_hv32_8_c: 15926.8
>> put_hevc_qpel_uni_w_hv32_8_neon: 3858.6
>> put_hevc_qpel_uni_w_hv48_8_c: 35051.1
>> put_hevc_qpel_uni_w_hv48_8_neon: 9301.1
>> put_hevc_qpel_uni_w_hv64_8_c: 61215.3
>> put_hevc_qpel_uni_w_hv64_8_neon: 14920.1
>>
>> put_hevc_qpel_uni_h4_8_c: 143.3
>> put_hevc_qpel_uni_h4_8_neon: 55.3
>> put_hevc_qpel_uni_h6_8_c: 304.6
>> put_hevc_qpel_uni_h6_8_neon: 82.3
>> put_hevc_qpel_uni_h8_8_c: 557.8
>> put_hevc_qpel_uni_h8_8_neon: 99.3
>> put_hevc_qpel_uni_h12_8_c: 1228.3
>> put_hevc_qpel_uni_h12_8_neon: 251.6
>> put_hevc_qpel_uni_h16_8_c: 2210.3
>> put_hevc_qpel_uni_h16_8_neon: 324.6
>> put_hevc_qpel_uni_h24_8_c: 4859.1
>> put_hevc_qpel_uni_h24_8_neon: 962.3
>> put_hevc_qpel_uni_h32_8_c: 8728.6
>> put_hevc_qpel_uni_h32_8_neon: 1249.6
>> put_hevc_qpel_uni_h48_8_c: 20346.3
>> put_hevc_qpel_uni_h48_8_neon: 2824.1
>> put_hevc_qpel_uni_h64_8_c: 36702.6
>> put_hevc_qpel_uni_h64_8_neon: 5012.1
>>
>>
>>
>>
>> Signed-off-by: myais <Logan.Lyu@myais.com.cn>
>> ---
>>    libavcodec/aarch64/hevcdsp_init_aarch64.c |   96 +
>>    libavcodec/aarch64/hevcdsp_qpel_neon.S    | 2223 +++++++++++++++++++++
>>    2 files changed, 2319 insertions(+)
>>
>> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> index be1049a2ec..42b8e9169d 100644
>> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> @@ -128,6 +128,91 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t
>> *_dst, ptrdiff_t _dststride, co
>>                                             ptrdiff_t _srcstride, const
>> int16_t *src2, int height, intptr_t
>>                                             mx, intptr_t my, int width);
>>    +#define NEON8_FNPROTO(fn, args, ext) \
>> +    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
>> +
>> +#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
>> +    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
>> +
>> +#define NEON8_FNPROTO_PARTIAL_5(fn, args, ext) \
>> +    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
>> +
>> +
>> +NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, int denom, int wx, int ox,
>> +        intptr_t mx, intptr_t my, int width),);
>> +
>> +NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t
>> _dststride,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, int denom, int wx, int ox,
>> +        intptr_t mx, intptr_t my, int width),);
>> +
>> +#if defined(__ARM_FEATURE_DOTPROD)
>> +NEON8_FNPROTO(qpel_h, (int16_t *dst,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, intptr_t mx, intptr_t my, int width), _dotprod);
>> +
>> +NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, int denom, int wx, int ox,
>> +        intptr_t mx, intptr_t my, int width), _dotprod);
>> +
>> +NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t
>> _dststride,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, int denom, int wx, int ox,
>> +        intptr_t mx, intptr_t my, int width), _dotprod);
>> +
>> +#endif
>> +
>> +#define NEON8_FNASSIGN(member, v, h, fn, ext) \
>> +        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
>> +        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
>> +        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
>> +        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
>> +        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
>> +        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
>> +        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
>> +        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
>> +        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
>> +
>> +#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
>> +        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
>> +        member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
>> +        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
>> +        member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
>> +        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
>> +        member[6][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
>> +        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
>> +        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
>> +        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
>> +
>> +#define NEON8_FNASSIGN_PARTIAL_5(member, v, h, fn, ext) \
>> +        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
>> +        member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
>> +        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
>> +        member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
>> +        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
>> +        member[6][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
>> +        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
>> +        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
>> +        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
>> +
>>    av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int
>> bit_depth)
>>    {
>>        if (!have_neon(av_get_cpu_flags())) return;
>> @@ -185,6 +270,17 @@ av_cold void
>> ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>>            c->put_hevc_qpel_bi[7][0][1]   =
>>            c->put_hevc_qpel_bi[8][0][1]   =
>>            c->put_hevc_qpel_bi[9][0][1]   =
>> ff_hevc_put_hevc_qpel_bi_h16_8_neon;
>> +
>> +        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
>> +        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
>> +        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
>> +
>> +    #if defined(__ARM_FEATURE_DOTPROD)
>> +        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _dotprod);
>> +        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h,
>> _dotprod);
>> +        NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1,
>> qpel_uni_w_hv, _dotprod);
>> +
>> +    #endif
>>        }
>>        if (bit_depth == 10) {
>>            c->hevc_h_loop_filter_chroma   =
>> ff_hevc_h_loop_filter_chroma_10_neon;
>> diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S
>> b/libavcodec/aarch64/hevcdsp_qpel_neon.S
>> index 0e7b912678..e30ac1b465 100644
>> --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
>> +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
>> @@ -30,6 +30,13 @@ const qpel_filters, align=4
>>            .byte           0,  1, -5, 17, 58,-10, 4, -1
>>    endconst
>>    +const qpel_filters_abs, align=4
>> +        .byte           0,  0,  0,  0,  0,  0, 0,  0
>> +        .byte           1,  4, 10, 58, 17,  5, 1,  0
>> +        .byte           1,  4, 11, 40, 40, 11, 4,  1
>> +        .byte           0,  1,  5, 17, 58, 10, 4,  1
>> +endconst
>> +
>>    .macro load_filter m
>>            movrel          x15, qpel_filters
>>            add             x15, x15, \m, lsl #3
>> @@ -482,3 +489,2219 @@ endfunc
>>    put_hevc qpel
>>    put_hevc qpel_uni
>>    put_hevc qpel_bi
>> +
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.8h, w7
>> +1:
>> +        ldr     s0, [x2]
>> +        ldr     s1, [x2, x3]
>> +        add     x2, x2, x3, lsl 1
>> +        ushll   v0.8h, v0.8b, #6
>> +        ushll   v1.8h, v1.8b, #6
>> +        smull   v0.4s, v0.4h, v30.4h
>> +        smull   v1.4s, v1.4h, v30.4h
>> +        sqrshl  v0.4s, v0.4s, v31.4s
>> +        sqrshl  v1.4s, v1.4s, v31.4s
>> +        sqadd   v0.4s, v0.4s, v29.4s
>> +        sqadd   v1.4s, v1.4s, v29.4s
>> +        sqxtn  v0.4h, v0.4s
>> +        sqxtn  v1.4h, v1.4s
>> +        sqxtun  v0.8b, v0.8h
>> +        sqxtun  v1.8b, v1.8h
>> +        str     s0, [x0]
>> +        str     s1, [x0, x1]
>> +        add     x0, x0, x1, lsl 1
>> +        subs    w4, w4, #2
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +        sub     x1, x1, #4
>> +1:
>> +        ldr     d0, [x2]
>> +        ldr     d1, [x2, x3]
>> +        add     x2, x2, x3, lsl 1
>> +        ushll   v0.8h, v0.8b, #6
>> +        ushll   v1.8h, v1.8b, #6
>> +        smull   v4.4s, v0.4h, v30.4h
>> +        smull2  v5.4s, v0.8h, v30.8h
>> +        smull   v6.4s, v1.4h, v30.4h
>> +        smull2  v7.4s, v1.8h, v30.8h
>> +        sqrshl  v4.4s, v4.4s, v31.4s
>> +        sqrshl  v5.4s, v5.4s, v31.4s
>> +        sqrshl  v6.4s, v6.4s, v31.4s
>> +        sqrshl  v7.4s, v7.4s, v31.4s
>> +        sqadd   v4.4s, v4.4s, v29.4s
>> +        sqadd   v5.4s, v5.4s, v29.4s
>> +        sqadd   v6.4s, v6.4s, v29.4s
>> +        sqadd   v7.4s, v7.4s, v29.4s
>> +        sqxtn   v0.4h, v4.4s
>> +        sqxtn2  v0.8h, v5.4s
>> +        sqxtn   v1.4h, v6.4s
>> +        sqxtn2  v1.8h, v7.4s
>> +        sqxtun  v0.8b, v0.8h
>> +        sqxtun  v1.8b, v1.8h
>> +        str     s0, [x0], #4
>> +        st1     {v0.h}[2], [x0], x1
>> +        str     s1, [x0], #4
>> +        st1     {v1.h}[2], [x0], x1
>> +        subs    w4, w4, #2
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ldr     d0, [x2]
>> +        ldr     d1, [x2, x3]
>> +        add     x2, x2, x3, lsl 1
>> +        ushll   v0.8h, v0.8b, #6
>> +        ushll   v1.8h, v1.8b, #6
>> +        smull   v4.4s, v0.4h, v30.4h
>> +        smull2  v5.4s, v0.8h, v30.8h
>> +        smull   v6.4s, v1.4h, v30.4h
>> +        smull2  v7.4s, v1.8h, v30.8h
>> +        sqrshl  v4.4s, v4.4s, v31.4s
>> +        sqrshl  v5.4s, v5.4s, v31.4s
>> +        sqrshl  v6.4s, v6.4s, v31.4s
>> +        sqrshl  v7.4s, v7.4s, v31.4s
>> +        sqadd   v4.4s, v4.4s, v29.4s
>> +        sqadd   v5.4s, v5.4s, v29.4s
>> +        sqadd   v6.4s, v6.4s, v29.4s
>> +        sqadd   v7.4s, v7.4s, v29.4s
>> +        sqxtn   v0.4h, v4.4s
>> +        sqxtn2  v0.8h, v5.4s
>> +        sqxtn   v1.4h, v6.4s
>> +        sqxtn2  v1.8h, v7.4s
>> +        sqxtun  v0.8b, v0.8h
>> +        sqxtun  v1.8b, v1.8h
>> +        str     d0, [x0]
>> +        str     d1, [x0, x1]
>> +        add     x0, x0, x1, lsl 1
>> +        subs    w4, w4, #2
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +        sub     x1, x1, #8
>> +1:
>> +        ldr     q0, [x2]
>> +        ldr     q1, [x2, x3]
>> +        add     x2, x2, x3, lsl 1
>> +        ushll   v4.8h, v0.8b, #6
>> +        ushll2  v5.8h, v0.16b, #6
>> +        ushll   v6.8h, v1.8b, #6
>> +        ushll2  v7.8h, v1.16b, #6
>> +        smull   v16.4s, v4.4h, v30.4h
>> +        smull2  v17.4s, v4.8h, v30.8h
>> +        smull   v18.4s, v5.4h, v30.4h
>> +        smull2  v19.4s, v5.8h, v30.8h
>> +        smull   v20.4s, v6.4h, v30.4h
>> +        smull2  v21.4s, v6.8h, v30.8h
>> +        smull   v22.4s, v7.4h, v30.4h
>> +        smull2  v23.4s, v7.8h, v30.8h
>> +
>> +        sqrshl  v16.4s, v16.4s, v31.4s
>> +        sqrshl  v17.4s, v17.4s, v31.4s
>> +        sqrshl  v18.4s, v18.4s, v31.4s
>> +        sqrshl  v19.4s, v19.4s, v31.4s
>> +        sqrshl  v20.4s, v20.4s, v31.4s
>> +        sqrshl  v21.4s, v21.4s, v31.4s
>> +        sqrshl  v22.4s, v22.4s, v31.4s
>> +        sqrshl  v23.4s, v23.4s, v31.4s
>> +        sqadd   v16.4s, v16.4s, v29.4s
>> +        sqadd   v17.4s, v17.4s, v29.4s
>> +        sqadd   v18.4s, v18.4s, v29.4s
>> +        sqadd   v19.4s, v19.4s, v29.4s
>> +        sqadd   v20.4s, v20.4s, v29.4s
>> +        sqadd   v21.4s, v21.4s, v29.4s
>> +        sqadd   v22.4s, v22.4s, v29.4s
>> +        sqadd   v23.4s, v23.4s, v29.4s
>> +        sqxtn   v0.4h, v16.4s
>> +        sqxtn2  v0.8h, v17.4s
>> +        sqxtn   v1.4h, v18.4s
>> +        sqxtn2  v1.8h, v19.4s
>> +        sqxtn   v2.4h, v20.4s
>> +        sqxtn2  v2.8h, v21.4s
>> +        sqxtn   v3.4h, v22.4s
>> +        sqxtn2  v3.8h, v23.4s
>> +        sqxtun  v0.8b, v0.8h
>> +        sqxtun2 v0.16b, v1.8h
>> +        sqxtun  v2.8b, v2.8h
>> +        sqxtun2 v2.16b, v3.8h
>> +        str     d0, [x0], #8
>> +        st1     {v0.s}[2], [x0], x1
>> +        str     d2, [x0], #8
>> +        st1     {v2.s}[2], [x0], x1
>> +        subs    w4, w4, #2
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3
>> +        ushll   \t0\().8h, \s0\().8b, #6
>> +        ushll2  \t1\().8h, \s0\().16b, #6
>> +        smull   \d0\().4s, \t0\().4h, v30.4h
>> +        smull2  \d1\().4s, \t0\().8h, v30.8h
>> +        smull   \d2\().4s, \t1\().4h, v30.4h
>> +        smull2  \d3\().4s, \t1\().8h, v30.8h
>> +        sqrshl  \d0\().4s, \d0\().4s, v31.4s
>> +        sqrshl  \d1\().4s, \d1\().4s, v31.4s
>> +        sqrshl  \d2\().4s, \d2\().4s, v31.4s
>> +        sqrshl  \d3\().4s, \d3\().4s, v31.4s
>> +        sqadd   \d0\().4s, \d0\().4s, v29.4s
>> +        sqadd   \d1\().4s, \d1\().4s, v29.4s
>> +        sqadd   \d2\().4s, \d2\().4s, v29.4s
>> +        sqadd   \d3\().4s, \d3\().4s, v29.4s
>> +        sqxtn   \t0\().4h, \d0\().4s
>> +        sqxtn2  \t0\().8h, \d1\().4s
>> +        sqxtn   \t1\().4h, \d2\().4s
>> +        sqxtn2  \t1\().8h, \d3\().4s
>> +        sqxtun  \s0\().8b,  \t0\().8h
>> +        sqxtun2 \s0\().16b, \t1\().8h
>> +.endm
>> +
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ldr     q0, [x2]
>> +        ldr     q1, [x2, x3]
>> +        add     x2, x2, x3, lsl 1
>> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
>> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
>> +        str     q0, [x0]
>> +        str     q1, [x0, x1]
>> +        add     x0, x0, x1, lsl 1
>> +        subs    w4, w4, #2
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ld1     {v0.16b, v1.16b}, [x2], x3
>> +        ushll   v4.8h, v0.8b, #6
>> +        ushll2  v5.8h, v0.16b, #6
>> +        ushll   v6.8h, v1.8b, #6
>> +        smull   v16.4s, v4.4h, v30.4h
>> +        smull2  v17.4s, v4.8h, v30.8h
>> +        smull   v18.4s, v5.4h, v30.4h
>> +        smull2  v19.4s, v5.8h, v30.8h
>> +        smull   v20.4s, v6.4h, v30.4h
>> +        smull2  v21.4s, v6.8h, v30.8h
>> +        sqrshl  v16.4s, v16.4s, v31.4s
>> +        sqrshl  v17.4s, v17.4s, v31.4s
>> +        sqrshl  v18.4s, v18.4s, v31.4s
>> +        sqrshl  v19.4s, v19.4s, v31.4s
>> +        sqrshl  v20.4s, v20.4s, v31.4s
>> +        sqrshl  v21.4s, v21.4s, v31.4s
>> +        sqadd   v16.4s, v16.4s, v29.4s
>> +        sqadd   v17.4s, v17.4s, v29.4s
>> +        sqadd   v18.4s, v18.4s, v29.4s
>> +        sqadd   v19.4s, v19.4s, v29.4s
>> +        sqadd   v20.4s, v20.4s, v29.4s
>> +        sqadd   v21.4s, v21.4s, v29.4s
>> +        sqxtn   v0.4h, v16.4s
>> +        sqxtn2  v0.8h, v17.4s
>> +        sqxtn   v1.4h, v18.4s
>> +        sqxtn2  v1.8h, v19.4s
>> +        sqxtn   v2.4h, v20.4s
>> +        sqxtn2  v2.8h, v21.4s
>> +        sqxtun  v0.8b, v0.8h
>> +        sqxtun  v1.8b, v1.8h
>> +        sqxtun  v2.8b, v2.8h
>> +        st1     {v0.8b, v1.8b, v2.8b}, [x0], x1
>> +        subs    w4, w4, #1
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ld1     {v0.16b, v1.16b}, [x2], x3
>> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
>> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
>> +        st1     {v0.16b, v1.16b}, [x0], x1
>> +        subs    w4, w4, #1
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ld1     {v0.16b, v1.16b, v2.16b}, [x2], x3
>> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
>> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
>> +        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
>> +        st1     {v0.16b, v1.16b, v2.16b}, [x0], x1
>> +        subs    w4, w4, #1
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
>> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
>> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
>> +        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
>> +        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
>> +        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
>> +        subs    w4, w4, #1
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_V_HEADER
>> +        ldur            x12, [sp, #8]          // my
>> +        sub             x2, x2, x3, lsl #1
>> +        sub             x2, x2, x3
>> +        movrel          x9, qpel_filters_abs
>> +        add             x9, x9, x12, lsl 3
>> +        ldr             d28, [x9]
>> +        dup             v0.16b, v28.b[0]
>> +        dup             v1.16b, v28.b[1]
>> +        dup             v2.16b, v28.b[2]
>> +        dup             v3.16b, v28.b[3]
>> +        dup             v4.16b, v28.b[4]
>> +        dup             v5.16b, v28.b[5]
>> +        dup             v6.16b, v28.b[6]
>> +        dup             v7.16b, v28.b[7]
>> +
>> +        mov             w10, #-6
>> +        sub             w10, w10, w5
>> +        dup             v30.8h, w6              // wx
>> +        dup             v31.4s, w10             // shift
>> +        dup             v29.4s, w7              // ox
>> +.endm
>> +
>> +.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
>> +        umull           \dst\().8h, \src1\().8b, v1.8b
>> +        umlsl           \dst\().8h, \src0\().8b, v0.8b
>> +        umlsl           \dst\().8h, \src2\().8b, v2.8b
>> +        umlal           \dst\().8h, \src3\().8b, v3.8b
>> +        umlal           \dst\().8h, \src4\().8b, v4.8b
>> +        umlsl           \dst\().8h, \src5\().8b, v5.8b
>> +        umlal           \dst\().8h, \src6\().8b, v6.8b
>> +        umlsl           \dst\().8h, \src7\().8b, v7.8b
>> +.endm
>> +
>> +.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
>> +        umull2          \dst\().8h, \src1\().16b, v1.16b
>> +        umlsl2          \dst\().8h, \src0\().16b, v0.16b
>> +        umlsl2          \dst\().8h, \src2\().16b, v2.16b
>> +        umlal2          \dst\().8h, \src3\().16b, v3.16b
>> +        umlal2          \dst\().8h, \src4\().16b, v4.16b
>> +        umlsl2          \dst\().8h, \src5\().16b, v5.16b
>> +        umlal2          \dst\().8h, \src6\().16b, v6.16b
>> +        umlsl2          \dst\().8h, \src7\().16b, v7.16b
>> +.endm
>> +
>> +.macro  QPEL_UNI_W_V_4
>> +        smull           v24.4s, v24.4h, v30.4h
>> +        sqrshl          v24.4s, v24.4s, v31.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        st1             {v24.s}[0], [x0], x1
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
>> +        QPEL_UNI_W_V_HEADER
>> +        ldr             s16, [x2]
>> +        ldr             s17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             s18, [x2]
>> +        ldr             s19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             s20, [x2]
>> +        ldr             s21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             s22, [x2]
>> +
>> +1:      ldr             s23, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v24, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s16, [x2]
>> +        QPEL_FILTER_B     v24, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v24, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s18, [x2]
>> +        QPEL_FILTER_B     v24, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v24, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s20, [x2]
>> +        QPEL_FILTER_B     v24, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v24, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s22, [x2]
>> +        QPEL_FILTER_B     v24, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.ne            1b
>> +2:
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_V_8
>> +        smull           v24.4s, v26.4h, v30.4h
>> +        smull2          v25.4s, v26.8h, v30.8h
>> +        sqrshl          v24.4s, v24.4s, v31.4s
>> +        sqrshl          v25.4s, v25.4s, v31.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqadd           v25.4s, v25.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtn2          v24.8h, v25.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        st1             {v24.d}[0], [x0], x1
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
>> +        QPEL_UNI_W_V_HEADER
>> +        ldr             d16, [x2]
>> +        ldr             d17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             d18, [x2]
>> +        ldr             d19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             d20, [x2]
>> +        ldr             d21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             d22, [x2]
>> +
>> +1:      ldr             d23, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d16, [x2]
>> +        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d18, [x2]
>> +        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d20, [x2]
>> +        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d22, [x2]
>> +        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.ne            1b
>> +2:
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_V_16
>> +        smull           v24.4s, v26.4h, v30.4h
>> +        smull2          v25.4s, v26.8h, v30.8h
>> +        smull           v26.4s, v27.4h, v30.4h
>> +        smull2          v27.4s, v27.8h, v30.8h
>> +        sqrshl          v24.4s, v24.4s, v31.4s
>> +        sqrshl          v25.4s, v25.4s, v31.4s
>> +        sqrshl          v26.4s, v26.4s, v31.4s
>> +        sqrshl          v27.4s, v27.4s, v31.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqadd           v25.4s, v25.4s, v29.4s
>> +        sqadd           v26.4s, v26.4s, v29.4s
>> +        sqadd           v27.4s, v27.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtn2          v24.8h, v25.4s
>> +        sqxtn           v26.4h, v26.4s
>> +        sqxtn2          v26.8h, v27.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        sqxtun2         v24.16b, v26.8h
>> +        st1             {v24.16b}, [x0], x1
>> +.endm
>> +
>> +// 16-wide vertical 8-tap qpel filter with unidirectional weighting.
>> +// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height
>> +// (register roles inferred from the code below; filter taps, wx, shift and
>> +// ox are set up by QPEL_UNI_W_V_HEADER, defined elsewhere in the file).
>> +// The loop is unrolled 8x: v16..v23 hold a sliding window of 8 source rows
>> +// and each stage reloads only the register holding the oldest row, passing
>> +// the window to QPEL_FILTER_B/_B2 in rotated order.
>> +function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
>> +        QPEL_UNI_W_V_HEADER
>> +        ldr             q16, [x2]                       // prime the 8-row window
>> +        ldr             q17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q18, [x2]
>> +        ldr             q19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q20, [x2]
>> +        ldr             q21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q22, [x2]
>> +
>> +1:      ldr             q23, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q16, [x2]
>> +        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q18, [x2]
>> +        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q20, [x2]
>> +        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q22, [x2]
>> +        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.ne            1b
>> +2:
>> +        ret
>> +endfunc
>> +
>> +// Wide vertical qpel_uni_w filter: despite the _v64 name it processes the
>> +// image in 16-pixel column strips driven by the real width.
>> +// w13 = width (loaded from the stack); x14/x15/w11 preserve dst/src/height
>> +// so each strip restarts from the top-left of the next 16 columns.
>> +// Inner loop is the same 8x-unrolled sliding-window pipeline as the v16
>> +// function above.
>> +function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
>> +        QPEL_UNI_W_V_HEADER
>> +        ldur            w13, [sp, #16]                  // width argument
>> +        mov             x14, x0                         // save dst
>> +        mov             x15, x2                         // save src
>> +        mov             w11, w4                         // save height
>> +
>> +3:
>> +        ldr             q16, [x2]                       // prime the 8-row window
>> +        ldr             q17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q18, [x2]
>> +        ldr             q19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q20, [x2]
>> +        ldr             q21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q22, [x2]
>> +
>> +
>> +1:      ldr             q23, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q16, [x2]
>> +        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q18, [x2]
>> +        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q20, [x2]
>> +        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q22, [x2]
>> +        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.ne            1b
>> +2:
>> +        subs            w13, w13, #16                   // next 16-pixel column strip
>> +        add             x14, x14, #16
>> +        add             x15, x15, #16
>> +        mov             x0, x14
>> +        mov             x2, x15
>> +        mov             w4, w11                         // restore height for new strip
>> +        b.hi            3b
>> +        ret
>> +endfunc
>> +
>> +#if __ARM_FEATURE_MATMUL_INT8 // usdot (mixed-sign dot product) requires the I8MM extension, not plain DOTPROD
>> +// Common setup for the horizontal uni_w paths:
>> +//   x12 = mx filter index (9th argument, from the stack)
>> +//   v28 = the eight 8-bit filter taps replicated into both 64-bit lanes
>> +//   v30 = wx, v29 = ox, v31 = -(6 + w5) so sqrshl performs the rounding
>> +//         right shift by (denom + 6)
>> +// Also rewinds src (x2) by 3 so tap 0 reads pixel x-3.
>> +.macro QPEL_UNI_W_H_HEADER
>> +        ldr             x12, [sp]
>> +        sub             x2, x2, #3
>> +        movrel          x9, qpel_filters
>> +        add             x9, x9, x12, lsl 3
>> +        ldr             x11, [x9]
>> +        dup             v28.2d, x11
>> +        mov             w10, #-6
>> +        sub             w10, w10, w5
>> +        dup             v30.4s, w6              // wx
>> +        dup             v31.4s, w10             // shift
>> +        dup             v29.4s, w7              // ox
>> +.endm
>> +
>> +// 4-wide horizontal 8-tap qpel filter + uni weighting, one row per loop.
>> +// Each usdot yields two 8-tap partial sums (one per 8-byte lane); zip1
>> +// pairs phases 0/1 and 2/3 so one addp folds them into 4 output pixels.
>> +function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_dotprod, export=1
>> +1:
>> +        ld1             {v0.16b}, [x2], x3
>> +        ext             v1.16b, v0.16b, v0.16b, #1
>> +        ext             v2.16b, v0.16b, v0.16b, #2
>> +        ext             v3.16b, v0.16b, v0.16b, #3
>> +        zip1            v0.2d, v0.2d, v1.2d             // phases 0|1
>> +        zip1            v2.2d, v2.2d, v3.2d             // phases 2|3
>> +        movi            v16.2d, #0
>> +        movi            v17.2d, #0
>> +        usdot           v16.4s, v0.16b, v28.16b
>> +        usdot           v17.4s, v2.16b, v28.16b
>> +        addp            v16.4s, v16.4s, v17.4s          // 4 full 8-tap sums
>> +        mul             v16.4s, v16.4s, v30.4s          // * wx
>> +        sqrshl          v16.4s, v16.4s, v31.4s          // >> shift (rounding)
>> +        sqadd           v16.4s, v16.4s, v29.4s          // + ox
>> +        sqxtn           v16.4h, v16.4s
>> +        sqxtun          v16.8b, v16.8h
>> +        str             s16, [x0]
>> +        add             x0, x0, x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +// 6-wide horizontal qpel filter + uni weighting. Pixels 0-3 go through the
>> +// 4-wide path (v16), pixels 4-5 through a 2-lane tail (v18). The row is
>> +// stored as 4 bytes + a halfword, hence dst stride is pre-reduced by 4.
>> +function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +        sub             x1, x1, #4
>> +1:
>> +        ld1             {v0.16b}, [x2], x3
>> +        ext             v1.16b, v0.16b, v0.16b, #1
>> +        ext             v2.16b, v0.16b, v0.16b, #2
>> +        ext             v3.16b, v0.16b, v0.16b, #3
>> +        ext             v4.16b, v0.16b, v0.16b, #4
>> +        ext             v5.16b, v0.16b, v0.16b, #5
>> +        zip1            v0.2d, v0.2d, v1.2d             // phases 0|1
>> +        zip1            v2.2d, v2.2d, v3.2d             // phases 2|3
>> +        zip1            v4.2d, v4.2d, v5.2d             // phases 4|5
>> +        movi            v16.2d, #0
>> +        movi            v17.2d, #0
>> +        movi            v18.2d, #0
>> +        usdot           v16.4s, v0.16b, v28.16b
>> +        usdot           v17.4s, v2.16b, v28.16b
>> +        usdot           v18.4s, v4.16b, v28.16b
>> +        addp            v16.4s, v16.4s, v17.4s          // pixels 0..3
>> +        addp            v18.4s, v18.4s, v18.4s          // pixels 4..5 (low half)
>> +        mul             v16.4s, v16.4s, v30.4s
>> +        mul             v18.2s, v18.2s, v30.2s
>> +        sqrshl          v16.4s, v16.4s, v31.4s
>> +        sqrshl          v18.2s, v18.2s, v31.2s
>> +        sqadd           v16.4s, v16.4s, v29.4s
>> +        sqadd           v18.2s, v18.2s, v29.2s
>> +        sqxtn           v16.4h, v16.4s
>> +        sqxtn2          v16.8h, v18.4s
>> +        sqxtun          v16.8b, v16.8h
>> +        str             s16, [x0], #4                   // pixels 0..3
>> +        st1             {v16.h}[2], [x0], x1            // pixels 4..5
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +
>> +// Filter + weight eight phase-pairs: four usdot accumulators are pairwise
>> +// added into eight 8-tap sums in \d0/\d2, then scaled (* v30 wx),
>> +// round-shifted (v31) and offset (v29). \d1/\d3 are scratch only; their
>> +// final contents are meaningless.
>> +.macro  QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
>> +        movi            \d0\().2d, #0
>> +        movi            \d1\().2d, #0
>> +        movi            \d2\().2d, #0
>> +        movi            \d3\().2d, #0
>> +        usdot           \d0\().4s, \s0\().16b, v28.16b
>> +        usdot           \d1\().4s, \s1\().16b, v28.16b
>> +        usdot           \d2\().4s, \s2\().16b, v28.16b
>> +        usdot           \d3\().4s, \s3\().16b, v28.16b
>> +        addp            \d0\().4s, \d0\().4s, \d1\().4s
>> +        addp            \d2\().4s, \d2\().4s, \d3\().4s
>> +        mul             \d0\().4s, \d0\().4s, v30.4s
>> +        mul             \d2\().4s, \d2\().4s, v30.4s
>> +        sqrshl          \d0\().4s, \d0\().4s, v31.4s
>> +        sqrshl          \d2\().4s, \d2\().4s, v31.4s
>> +        sqadd           \d0\().4s, \d0\().4s, v29.4s
>> +        sqadd           \d2\().4s, \d2\().4s, v29.4s
>> +.endm
>> +
>> +// Half-width variant of QPEL_UNI_W_H_CALC: four 8-tap sums into \d0,
>> +// then * wx, >> shift, + ox. \d1 is scratch only.
>> +.macro  QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
>> +        movi            \d0\().2d, #0
>> +        movi            \d1\().2d, #0
>> +        usdot           \d0\().4s, \s0\().16b, v28.16b
>> +        usdot           \d1\().4s, \s1\().16b, v28.16b
>> +        addp            \d0\().4s, \d0\().4s, \d1\().4s
>> +        mul             \d0\().4s, \d0\().4s, v30.4s
>> +        sqrshl          \d0\().4s, \d0\().4s, v31.4s
>> +        sqadd           \d0\().4s, \d0\().4s, v29.4s
>> +.endm
>> +
>> +
>> +// 8-wide horizontal qpel filter + uni weighting, one row per loop.
>> +// Phases 0..7 are built with ext, paired with zip1, then reduced by
>> +// QPEL_UNI_W_H_CALC into v18 (pixels 0..3) and v20 (pixels 4..7).
>> +function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        zip1            v0.2d, v16.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        zip1            v4.2d, v4.2d, v5.2d
>> +        zip1            v6.2d, v6.2d, v7.2d
>> +        QPEL_UNI_W_H_CALC  v0, v2, v4, v6,  v18, v19, v20, v21
>> +        sqxtn           v18.4h, v18.4s
>> +        sqxtn2          v18.8h, v20.4s
>> +        sqxtun          v18.8b, v18.8h
>> +        str             d18, [x0]
>> +        add             x0, x0, x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +// 12-wide horizontal qpel filter + uni weighting: 8 pixels via the full
>> +// CALC path (zip1 pairs), 4 more via CALC_HALF (zip2 pairs). x13 keeps a
>> +// second dst pointer at x0+8 for the trailing 4 bytes.
>> +function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +        add             x13, x0, #8
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        zip1            v18.2d, v16.2d, v1.2d
>> +        zip1            v19.2d, v2.2d, v3.2d
>> +        zip1            v20.2d, v4.2d, v5.2d
>> +        zip1            v21.2d, v6.2d, v7.2d
>> +        zip2            v22.2d, v16.2d, v1.2d           // high halves for pixels 8..11
>> +        zip2            v23.2d, v2.2d, v3.2d
>> +        QPEL_UNI_W_H_CALC  v18, v19, v20, v21, v0, v2, v4, v6
>> +        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25
>> +        sqxtn           v0.4h, v0.4s
>> +        sqxtn2          v0.8h, v4.4s
>> +        sqxtn           v1.4h, v24.4s
>> +        sqxtun          v0.8b, v0.8h
>> +        sqxtun          v1.8b, v1.8h
>> +
>> +        str             d0, [x0]                        // pixels 0..7
>> +        str             s1, [x13]                       // pixels 8..11
>> +        add             x0, x0, x1
>> +        add             x13, x13, x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +// 16-wide horizontal qpel filter + uni weighting. The even/odd phase
>> +// interleave produced by the CALC reductions is undone with trn1/trn2
>> +// before the final narrowing store.
>> +// (The two wrapped comment continuations from the original email have been
>> +// rejoined onto their instruction lines below.)
>> +function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21   // v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25    // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
>> +        sqxtn           v0.4h, v18.4s
>> +        sqxtn2          v0.8h, v22.4s
>> +        sqxtn           v1.4h, v20.4s
>> +        sqxtn2          v1.8h, v24.4s
>> +        trn1            v2.8h, v0.8h, v1.8h             // de-interleave even/odd pixels
>> +        trn2            v3.8h, v0.8h, v1.8h
>> +        sqxtun          v0.8b, v2.8h
>> +        sqxtun2         v0.16b, v3.8h
>> +        st1             {v0.16b}, [x0], x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +// 24-wide horizontal qpel filter + uni weighting: a 16-pixel group
>> +// (trn-de-interleaved, as in the h16 path) followed by an 8-pixel group
>> +// (zip1 pairing, as in the h8 path). Stored as 16 + 8 bytes, so the dst
>> +// stride is pre-reduced by 16.
>> +function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +        sub             x1, x1, #16
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
>> +        sqxtn           v18.4h, v18.4s
>> +        sqxtn2          v18.8h, v22.4s
>> +        sqxtn           v19.4h, v20.4s
>> +        sqxtn2          v19.8h, v24.4s
>> +        trn1            v20.8h, v18.8h, v19.8h
>> +        trn2            v21.8h, v18.8h, v19.8h
>> +        sqxtun          v26.8b, v20.8h
>> +        sqxtun2         v26.16b, v21.8h                         // 0-15
>> +        ext             v1.16b, v17.16b, v17.16b, #1
>> +        ext             v2.16b, v17.16b, v17.16b, #2
>> +        ext             v3.16b, v17.16b, v17.16b, #3
>> +        ext             v4.16b, v17.16b, v17.16b, #4
>> +        ext             v5.16b, v17.16b, v17.16b, #5
>> +        ext             v6.16b, v17.16b, v17.16b, #6
>> +        ext             v7.16b, v17.16b, v17.16b, #7
>> +        zip1            v0.2d, v17.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        zip1            v4.2d, v4.2d, v5.2d
>> +        zip1            v6.2d, v6.2d, v7.2d
>> +        QPEL_UNI_W_H_CALC  v0, v2, v4, v6, v18, v19, v20, v21
>> +        sqxtn           v18.4h, v18.4s
>> +        sqxtn2          v18.8h, v20.4s
>> +        sqxtun          v27.8b, v18.8h                  // pixels 16-23
>> +
>> +        st1             {v26.16b}, [x0], #16
>> +        st1             {v27.8b}, [x0], x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +
>> +// 32-wide horizontal qpel filter + uni weighting: two back-to-back
>> +// 16-pixel groups (same trn-de-interleave scheme as the h16 path), stored
>> +// with a single 32-byte st1.
>> +function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v0, v19, v20, v21
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
>> +        sqxtn           v0.4h, v0.4s
>> +        sqxtn2          v0.8h, v22.4s
>> +        sqxtn           v19.4h, v20.4s
>> +        sqxtn2          v19.8h, v24.4s
>> +        trn1            v20.8h, v0.8h, v19.8h
>> +        trn2            v21.8h, v0.8h, v19.8h
>> +        sqxtun          v26.8b, v20.8h
>> +        sqxtun2         v26.16b, v21.8h                         // 0-15
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v0, v19, v20, v21
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
>> +        sqxtn           v0.4h, v0.4s
>> +        sqxtn2          v0.8h, v22.4s
>> +        sqxtn           v19.4h, v20.4s
>> +        sqxtn2          v19.8h, v24.4s
>> +        trn1            v20.8h, v0.8h, v19.8h
>> +        trn2            v21.8h, v0.8h, v19.8h
>> +        sqxtun          v27.8b, v20.8h
>> +        sqxtun2         v27.16b, v21.8h                         // 16-31
>> +        st1             {v26.16b, v27.16b}, [x0], x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +// 48-wide horizontal qpel filter + uni weighting: three 16-pixel groups
>> +// per row (same scheme as the h16 path), one 48-byte store.
>> +function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v25.8b, v22.8h
>> +        sqxtun2         v25.16b, v23.8h                         // 0-15
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v26.8b, v22.8h
>> +        sqxtun2         v26.16b, v23.8h                         // 16-31
>> +        ext             v1.16b, v18.16b, v19.16b, #1
>> +        ext             v2.16b, v18.16b, v19.16b, #2
>> +        ext             v3.16b, v18.16b, v19.16b, #3
>> +        ext             v4.16b, v18.16b, v19.16b, #4
>> +        ext             v5.16b, v18.16b, v19.16b, #5
>> +        ext             v6.16b, v18.16b, v19.16b, #6
>> +        ext             v7.16b, v18.16b, v19.16b, #7
>> +        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v27.8b, v22.8h
>> +        sqxtun2         v27.16b, v23.8h                         // 32-47
>> +        st1             {v25.16b, v26.16b, v27.16b}, [x0], x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +
>> +
>> +// 64-wide horizontal qpel filter + uni weighting: four 16-pixel groups.
>> +// The first load post-increments x2 by 64, so the src stride is
>> +// pre-reduced by 64; the extra v0 load picks up src bytes 64..79 needed
>> +// by the ext windows of the last group before x2 steps to the next row.
>> +function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +        sub             x3, x3, #64
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v16.8b, v22.8h
>> +        sqxtun2         v16.16b, v23.8h                         // 0-15
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v17.8b, v22.8h
>> +        sqxtun2         v17.16b, v23.8h                         // 16-31
>> +        ext             v1.16b, v18.16b, v19.16b, #1
>> +        ext             v2.16b, v18.16b, v19.16b, #2
>> +        ext             v3.16b, v18.16b, v19.16b, #3
>> +        ext             v4.16b, v18.16b, v19.16b, #4
>> +        ext             v5.16b, v18.16b, v19.16b, #5
>> +        ext             v6.16b, v18.16b, v19.16b, #6
>> +        ext             v7.16b, v18.16b, v19.16b, #7
>> +        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        ld1             {v0.16b}, [x2], x3              // src bytes 64..79, then next row
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v18.8b, v22.8h
>> +        sqxtun2         v18.16b, v23.8h                         // 32-47
>> +        ext             v1.16b, v19.16b, v0.16b, #1
>> +        ext             v2.16b, v19.16b, v0.16b, #2
>> +        ext             v3.16b, v19.16b, v0.16b, #3
>> +        ext             v4.16b, v19.16b, v0.16b, #4
>> +        ext             v5.16b, v19.16b, v0.16b, #5
>> +        ext             v6.16b, v19.16b, v0.16b, #6
>> +        ext             v7.16b, v19.16b, v0.16b, #7
>> +        QPEL_UNI_W_H_CALC  v19, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v19.8b, v22.8h
>> +        sqxtun2         v19.16b, v23.8h                         // 48-63
>> +
>> +        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +
>> +// Setup for the plain (unweighted, 16-bit intermediate) horizontal path:
>> +// x4 = mx filter index; v31 = the eight 8-bit taps replicated into both
>> +// 64-bit lanes; src pointer (x1 in this interface) rewound by 3 taps.
>> +.macro QPEL_H_HEADER
>> +        movrel          x9, qpel_filters
>> +        add             x9, x9, x4, lsl 3
>> +        ldr             x11, [x9]
>> +        dup             v31.2d, x11
>> +        sub             x1, x1, #3
>> +.endm
>> +
>> +// 4-wide horizontal qpel filter, 16-bit intermediate output (no weighting).
>> +// x0 = dst (int16, row stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
>> +// w3 = height, x4 = mx (consumed by QPEL_H_HEADER).
>> +function ff_hevc_put_hevc_qpel_h4_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +1:
>> +        ld1             {v0.16b}, [x1], x2
>> +        ext             v1.16b, v0.16b, v0.16b, #1
>> +        ext             v2.16b, v0.16b, v0.16b, #2
>> +        ext             v3.16b, v0.16b, v0.16b, #3
>> +        zip1            v0.2d, v0.2d, v1.2d             // phases 0|1
>> +        zip1            v2.2d, v2.2d, v3.2d             // phases 2|3
>> +        movi            v16.2d, #0
>> +        movi            v17.2d, #0
>> +        usdot           v16.4s, v0.16b, v31.16b
>> +        usdot           v17.4s, v2.16b, v31.16b
>> +        addp            v16.4s, v16.4s, v17.4s          // 4 full 8-tap sums
>> +        sqxtn           v16.4h, v16.4s
>> +        str             d16, [x0]
>> +        add             x0, x0, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +// 6-wide horizontal qpel filter, 16-bit intermediate output: pixels 0..3
>> +// via the 4-wide path, pixels 4..5 via a 2-lane tail stored at x0+8.
>> +function ff_hevc_put_hevc_qpel_h6_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +        add             x15, x0, #8
>> +1:
>> +        ld1             {v0.16b}, [x1], x2
>> +        ext             v1.16b, v0.16b, v0.16b, #1
>> +        ext             v2.16b, v0.16b, v0.16b, #2
>> +        ext             v3.16b, v0.16b, v0.16b, #3
>> +        ext             v4.16b, v0.16b, v0.16b, #4
>> +        ext             v5.16b, v0.16b, v0.16b, #5
>> +        zip1            v0.2d, v0.2d, v1.2d             // phases 0|1
>> +        zip1            v2.2d, v2.2d, v3.2d             // phases 2|3
>> +        zip1            v4.2d, v4.2d, v5.2d             // phases 4|5
>> +        movi            v16.2d, #0
>> +        movi            v17.2d, #0
>> +        movi            v18.2d, #0
>> +        usdot           v16.4s, v0.16b, v31.16b
>> +        usdot           v17.4s, v2.16b, v31.16b
>> +        usdot           v18.4s, v4.16b, v31.16b
>> +        addp            v16.4s, v16.4s, v17.4s          // samples 0..3
>> +        addp            v18.4s, v18.4s, v18.4s          // samples 4..5
>> +        sqxtn           v16.4h, v16.4s
>> +        sqxtn           v18.4h, v18.4s
>> +        str             d16, [x0]
>> +        str             s18, [x15]
>> +        add             x0, x0, x10
>> +        add             x15, x15, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +// 8-wide horizontal qpel filter, 16-bit intermediate output: eight phases
>> +// built by ext, zip1-paired so four usdot + two addp yield the 8 sums.
>> +function ff_hevc_put_hevc_qpel_h8_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +1:
>> +        ld1             {v0.16b}, [x1], x2
>> +        ext             v1.16b, v0.16b, v0.16b, #1
>> +        ext             v2.16b, v0.16b, v0.16b, #2
>> +        ext             v3.16b, v0.16b, v0.16b, #3
>> +        ext             v4.16b, v0.16b, v0.16b, #4
>> +        ext             v5.16b, v0.16b, v0.16b, #5
>> +        ext             v6.16b, v0.16b, v0.16b, #6
>> +        ext             v7.16b, v0.16b, v0.16b, #7
>> +        zip1            v0.2d, v0.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        zip1            v4.2d, v4.2d, v5.2d
>> +        zip1            v6.2d, v6.2d, v7.2d
>> +        movi            v16.2d, #0
>> +        movi            v17.2d, #0
>> +        movi            v18.2d, #0
>> +        movi            v19.2d, #0
>> +        usdot           v16.4s, v0.16b, v31.16b
>> +        usdot           v17.4s, v2.16b, v31.16b
>> +        usdot           v18.4s, v4.16b, v31.16b
>> +        usdot           v19.4s, v6.16b, v31.16b
>> +        addp            v16.4s, v16.4s, v17.4s
>> +        addp            v18.4s, v18.4s, v19.4s
>> +        sqxtn           v16.4h, v16.4s
>> +        sqxtn2          v16.8h, v18.4s
>> +        str             q16, [x0]
>> +        add             x0, x0, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +// Four parallel 8-tap dot products against the filter in v31:
>> +// \d0..\d3 are zeroed then accumulate usdot(\s0..\s3, v31).
>> +// NOTE(review): the original email fused the first movi onto the .macro
>> +// line (line-wrap mangling); reconstructed here to match QPEL_UNI_W_H_CALC.
>> +.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
>> +        movi            \d0\().2d, #0
>> +        movi            \d1\().2d, #0
>> +        movi            \d2\().2d, #0
>> +        movi            \d3\().2d, #0
>> +        usdot           \d0\().4s, \s0\().16b, v31.16b
>> +        usdot           \d1\().4s, \s1\().16b, v31.16b
>> +        usdot           \d2\().4s, \s2\().16b, v31.16b
>> +        usdot           \d3\().4s, \s3\().16b, v31.16b
>> +.endm
>> +
>> +// 12-wide horizontal qpel filter, 16-bit intermediate output: 8 samples
>> +// via QPEL_H_CALC + trn de-interleave, 4 more via zip1-paired phases 4..7;
>> +// stored as one q register plus a d register at x0+16.
>> +function ff_hevc_put_hevc_qpel_h12_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +        add             x15, x0, #16
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x1], x2
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        zip1            v18.2d, v4.2d, v5.2d
>> +        zip1            v19.2d, v6.2d, v7.2d
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        movi            v24.2d, #0
>> +        movi            v25.2d, #0
>> +        usdot           v24.4s, v18.16b, v31.16b
>> +        usdot           v25.4s, v19.16b, v31.16b
>> +        addp            v24.4s, v24.4s, v25.4s
>> +        trn1            v26.4s, v20.4s, v21.4s          // de-interleave even/odd samples
>> +        trn2            v27.4s, v20.4s, v21.4s
>> +        sqxtn           v26.4h, v26.4s
>> +        sqxtn           v27.4h, v27.4s
>> +        sqxtn2          v26.8h, v24.4s
>> +
>> +        str             q26, [x0]                       // samples 0..7
>> +        str             d27, [x15]                      // samples 8..11
>> +        add             x0, x0, x10
>> +        add             x15, x15, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h16_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x1], x2
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +
>> +        sqxtn           v18.4h, v22.4s
>> +        sqxtn2          v18.8h, v26.4s
>> +        sqxtn           v19.4h, v23.4s
>> +        sqxtn2          v19.8h, v27.4s
>> +
>> +        stp             q18, q19, [x0]
>> +        add             x0, x0, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h24_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +        add             x15, x0, #32
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x1], x2
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v18.4h, v22.4s
>> +        sqxtn2          v18.8h, v26.4s
>> +        sqxtn           v19.4h, v23.4s
>> +        sqxtn2          v19.8h, v27.4s
>> +        stp             q18, q19, [x0]
>> +        add             x0, x0, x10
>> +        ext             v1.16b, v17.16b, v17.16b, #1
>> +        ext             v2.16b, v17.16b, v17.16b, #2
>> +        ext             v3.16b, v17.16b, v17.16b, #3
>> +        ext             v4.16b, v17.16b, v17.16b, #4
>> +        ext             v5.16b, v17.16b, v17.16b, #5
>> +        ext             v6.16b, v17.16b, v17.16b, #6
>> +        ext             v7.16b, v17.16b, v17.16b, #7
>> +        zip1            v0.2d, v17.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        zip1            v4.2d, v4.2d, v5.2d
>> +        zip1            v6.2d, v6.2d, v7.2d
>> +        QPEL_H_CALC     v0, v2, v4, v6, v20, v21, v22, v23
>> +        addp            v20.4s, v20.4s, v21.4s
>> +        addp            v22.4s, v22.4s, v23.4s
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        str             q20, [x15]
>> +        add             x15, x15, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h32_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +        add             x15, x0, #32
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0]
>> +        add             x0, x0, x10
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x15]
>> +        add             x15, x15, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h48_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2 - 64
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +        ext             v1.16b, v18.16b, v19.16b, #1
>> +        ext             v2.16b, v18.16b, v19.16b, #2
>> +        ext             v3.16b, v18.16b, v19.16b, #3
>> +        ext             v4.16b, v18.16b, v19.16b, #4
>> +        ext             v5.16b, v18.16b, v19.16b, #5
>> +        ext             v6.16b, v18.16b, v19.16b, #6
>> +        ext             v7.16b, v18.16b, v19.16b, #7
>> +        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0]
>> +        add             x0, x0, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h64_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        sub             x2, x2, #64
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +        ext             v1.16b, v18.16b, v19.16b, #1
>> +        ext             v2.16b, v18.16b, v19.16b, #2
>> +        ext             v3.16b, v18.16b, v19.16b, #3
>> +        ext             v4.16b, v18.16b, v19.16b, #4
>> +        ext             v5.16b, v18.16b, v19.16b, #5
>> +        ext             v6.16b, v18.16b, v19.16b, #6
>> +        ext             v7.16b, v18.16b, v19.16b, #7
>> +        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +        ld1             {v28.8b}, [x1], x2
>> +        ext             v1.16b, v19.16b, v28.16b, #1
>> +        ext             v2.16b, v19.16b, v28.16b, #2
>> +        ext             v3.16b, v19.16b, v28.16b, #3
>> +        ext             v4.16b, v19.16b, v28.16b, #4
>> +        ext             v5.16b, v19.16b, v28.16b, #5
>> +        ext             v6.16b, v19.16b, v28.16b, #6
>> +        ext             v7.16b, v19.16b, v28.16b, #7
>> +        QPEL_H_CALC     v19, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_HV_HEADER width
>> +        ldp             x14, x15, [sp]          // mx, my
>> +        ldr             w13, [sp, #16]          // width
>> +        stp             x20, x21, [sp, #-16]!
>> +        stp             x22, x23, [sp, #-16]!
>> +        stp             x24, x25, [sp, #-16]!
>> +        stp             x26, x27, [sp, #-16]!
>> +        stp             x28, x30, [sp, #-16]!
>> +        mov             x28, sp
>> +        mov             x11, #9088
>> +        sub             sp, sp, x11
>> +        mov             x20, x0
>> +        mov             x21, x1
>> +        mov             x0, sp
>> +        sub             x1, x2, x3, lsl 1
>> +        sub             x1, x1, x3
>> +        mov             x2, x3
>> +        add             w3, w4, #7
>> +        mov             w22, w4                 // height
>> +        mov             x4, x14                 // mx
>> +        mov             x23, x15                // my
>> +        mov             w24, w6                 // wx
>> +        mov             w25, w7                 // ox
>> +        mov             w26, #-6
>> +        sub             w26, w26, w5            // -shift
>> +        mov             w27, w13                // width
>> +        bl              X(ff_hevc_put_hevc_qpel_h\width\()_8_neon_dotprod)
>> +        movrel          x9, qpel_filters
>> +        add             x9, x9, x23, lsl 3
>> +        ld1             {v0.8b}, [x9]
>> +        sxtl            v0.8h, v0.8b
>> +        mov             x10, #(MAX_PB_SIZE * 2)
>> +        dup             v28.4s, w24
>> +        dup             v29.4s, w25
>> +        dup             v30.4s, w26
>> +.endm
>> +
>> +.macro QPEL_UNI_W_HV_END
>> +        mov             sp, x28
>> +        ldp             x28, x30, [sp], #16
>> +        ldp             x26, x27, [sp], #16
>> +        ldp             x24, x25, [sp], #16
>> +        ldp             x22, x23, [sp], #16
>> +        ldp             x20, x21, [sp], #16
>> +.endm
>> +
>> +.macro QPEL_UNI_W_HV_4
>> +        sshr            v26.4s, v26.4s, #6
>> +        mul             v24.4s, v26.4s, v28.4s
>> +        sqrshl          v24.4s, v24.4s, v30.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        st1             {v24.s}[0], [x20], x21
>> +.endm
>> +
>> +.macro QPEL_FILTER_H    dst, src0, src1, src2, src3, src4, src5, src6, src7
>> +        smull           \dst\().4s, \src0\().4h, v0.h[0]
>> +        smlal           \dst\().4s, \src1\().4h, v0.h[1]
>> +        smlal           \dst\().4s, \src2\().4h, v0.h[2]
>> +        smlal           \dst\().4s, \src3\().4h, v0.h[3]
>> +        smlal           \dst\().4s, \src4\().4h, v0.h[4]
>> +        smlal           \dst\().4s, \src5\().4h, v0.h[5]
>> +        smlal           \dst\().4s, \src6\().4h, v0.h[6]
>> +        smlal           \dst\().4s, \src7\().4h, v0.h[7]
>> +.endm
>> +
>> +.macro QPEL_FILTER_H2    dst, src0, src1, src2, src3, src4, src5, src6, src7
>> +        smull2          \dst\().4s, \src0\().8h, v0.h[0]
>> +        smlal2          \dst\().4s, \src1\().8h, v0.h[1]
>> +        smlal2          \dst\().4s, \src2\().8h, v0.h[2]
>> +        smlal2          \dst\().4s, \src3\().8h, v0.h[3]
>> +        smlal2          \dst\().4s, \src4\().8h, v0.h[4]
>> +        smlal2          \dst\().4s, \src5\().8h, v0.h[5]
>> +        smlal2          \dst\().4s, \src6\().8h, v0.h[6]
>> +        smlal2          \dst\().4s, \src7\().8h, v0.h[7]
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_hv4_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_HV_HEADER 4
>> +        ldr             d16, [sp]
>> +        ldr             d17, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             d18, [sp]
>> +        ldr             d19, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             d20, [sp]
>> +        ldr             d21, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             d22, [sp]
>> +        add             sp, sp, x10
>> +1:
>> +        ldr             d23, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d16, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d17, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d18, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d19, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d20, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d21, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d22, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.hi            1b
>> +
>> +2:
>> +        QPEL_UNI_W_HV_END
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_HV_8
>> +        sshr            v26.4s, v26.4s, #6
>> +        sshr            v27.4s, v27.4s, #6
>> +        mul             v24.4s, v26.4s, v28.4s
>> +        mul             v25.4s, v27.4s, v28.4s
>> +        sqrshl          v24.4s, v24.4s, v30.4s
>> +        sqrshl          v25.4s, v25.4s, v30.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqadd           v25.4s, v25.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtn2          v24.8h, v25.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        st1             {v24.d}[0], [x20], x21
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_HV_HEADER 8
>> +        ldr             q16, [sp]
>> +        ldr             q17, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             q18, [sp]
>> +        ldr             q19, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             q20, [sp]
>> +        ldr             q21, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             q22, [sp]
>> +        add             sp, sp, x10
>> +1:
>> +        ldr             q23, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H2  v27, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q16, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H2  v27, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q17, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H2  v27, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q18, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H2  v27, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q19, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H2  v27, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q20, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H2  v27, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q21, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H2  v27, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q22, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H2  v27, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.hi            1b
>> +
>> +2:
>> +        QPEL_UNI_W_HV_END
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_HV_16
>> +        sshr            v24.4s, v24.4s, #6
>> +        sshr            v25.4s, v25.4s, #6
>> +        sshr            v26.4s, v26.4s, #6
>> +        sshr            v27.4s, v27.4s, #6
>> +        mul             v24.4s, v24.4s, v28.4s
>> +        mul             v25.4s, v25.4s, v28.4s
>> +        mul             v26.4s, v26.4s, v28.4s
>> +        mul             v27.4s, v27.4s, v28.4s
>> +        sqrshl          v24.4s, v24.4s, v30.4s
>> +        sqrshl          v25.4s, v25.4s, v30.4s
>> +        sqrshl          v26.4s, v26.4s, v30.4s
>> +        sqrshl          v27.4s, v27.4s, v30.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqadd           v25.4s, v25.4s, v29.4s
>> +        sqadd           v26.4s, v26.4s, v29.4s
>> +        sqadd           v27.4s, v27.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtn2          v24.8h, v25.4s
>> +        sqxtn           v26.4h, v26.4s
>> +        sqxtn2          v26.8h, v27.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        sqxtun2         v24.16b, v26.8h
>> +
>> +        st1             {v24.16b}, [x20], x21
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_HV_HEADER 16
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +1:
>> +        ldp             q23, q31, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.hi            1b
>> +
>> +2:
>> +        QPEL_UNI_W_HV_END
>> +        ret
>> +endfunc
>> +
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_HV_HEADER 32
>> +        mov             x11, sp
>> +        mov             w12, w22
>> +        mov             x13, x20
>> +3:
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +1:
>> +        ldp             q23, q31, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.hi            1b
>> +2:
>> +        subs            w27, w27, #16
>> +        add             sp, x11, #32
>> +        add             x20, x13, #16
>> +        mov             w22, w12
>> +        mov             x11, sp
>> +        mov             x13, x20
>> +        b.hi            3b
>> +        QPEL_UNI_W_HV_END
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_HV_HEADER 64
>> +        mov             x11, sp
>> +        mov             w12, w22
>> +        mov             x13, x20
>> +3:
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +1:
>> +        ldp             q23, q31, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.hi            1b
>> +2:
>> +        subs            w27, w27, #16
>> +        add             sp, x11, #32
>> +        add             x20, x13, #16
>> +        mov             w22, w12
>> +        mov             x11, sp
>> +        mov             x13, x20
>> +        b.hi            3b
>> +        QPEL_UNI_W_HV_END
>> +        ret
>> +endfunc
>> +
>> +#endif // __ARM_FEATURE_DOTPROD
>> \ No newline at end of file
>> -- 
>> 2.38.0.windows.1
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
From 10924c4552031b9a35b514cdf11d48e122e0326a Mon Sep 17 00:00:00 2001
From: myais <Logan.Lyu@myais.com.cn>
Date: Wed, 3 May 2023 09:53:07 +0800
Subject: [PATCH 1/3] lavc/aarch64: new optimization for 8-bit
 hevc_pel_uni_w_pixels and qpel_uni_w_v

Signed-off-by: myais <Logan.Lyu@myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  55 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 710 ++++++++++++++++++++++
 2 files changed, 765 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index be1049a2ec..fd96819b5e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -128,6 +128,57 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
                                          ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                          mx, intptr_t my, int width);
 
+#define NEON8_FNPROTO(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+
+#define NEON8_FNASSIGN(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
+#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
     if (!have_neon(av_get_cpu_flags())) return;
@@ -185,6 +236,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[7][0][1]   =
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 0e7b912678..9e83bc0e01 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -30,6 +30,13 @@ const qpel_filters, align=4
         .byte           0,  1, -5, 17, 58,-10, 4, -1
 endconst
 
+const qpel_filters_abs, align=4
+        .byte           0,  0,  0,  0,  0,  0, 0,  0
+        .byte           1,  4, 10, 58, 17,  5, 1,  0
+        .byte           1,  4, 11, 40, 40, 11, 4,  1
+        .byte           0,  1,  5, 17, 58, 10, 4,  1
+endconst
+
 .macro load_filter m
         movrel          x15, qpel_filters
         add             x15, x15, \m, lsl #3
@@ -482,3 +489,706 @@ endfunc
 put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ldr     s0, [x2]
+        ldr     s1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v0.8h, v0.8b, #6
+        ushll   v1.8h, v1.8b, #6
+        smull   v0.4s, v0.4h, v30.4h
+        smull   v1.4s, v1.4h, v30.4h
+        sqrshl  v0.4s, v0.4s, v31.4s
+        sqrshl  v1.4s, v1.4s, v31.4s
+        sqadd   v0.4s, v0.4s, v29.4s
+        sqadd   v1.4s, v1.4s, v29.4s
+        sqxtn  v0.4h, v0.4s
+        sqxtn  v1.4h, v1.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     s0, [x0]
+        str     s1, [x0, x1]
+        add     x0, x0, x1, lsl 1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+        sub     x1, x1, #4
+1:
+        ldr     d0, [x2]
+        ldr     d1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v0.8h, v0.8b, #6
+        ushll   v1.8h, v1.8b, #6
+        smull   v4.4s, v0.4h, v30.4h
+        smull2  v5.4s, v0.8h, v30.8h
+        smull   v6.4s, v1.4h, v30.4h
+        smull2  v7.4s, v1.8h, v30.8h
+        sqrshl  v4.4s, v4.4s, v31.4s
+        sqrshl  v5.4s, v5.4s, v31.4s
+        sqrshl  v6.4s, v6.4s, v31.4s
+        sqrshl  v7.4s, v7.4s, v31.4s
+        sqadd   v4.4s, v4.4s, v29.4s
+        sqadd   v5.4s, v5.4s, v29.4s
+        sqadd   v6.4s, v6.4s, v29.4s
+        sqadd   v7.4s, v7.4s, v29.4s
+        sqxtn   v0.4h, v4.4s
+        sqxtn2  v0.8h, v5.4s
+        sqxtn   v1.4h, v6.4s
+        sqxtn2  v1.8h, v7.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     s0, [x0], #4
+        st1     {v0.h}[2], [x0], x1
+        str     s1, [x0], #4
+        st1     {v1.h}[2], [x0], x1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ldr     d0, [x2]
+        ldr     d1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v0.8h, v0.8b, #6
+        ushll   v1.8h, v1.8b, #6
+        smull   v4.4s, v0.4h, v30.4h
+        smull2  v5.4s, v0.8h, v30.8h
+        smull   v6.4s, v1.4h, v30.4h
+        smull2  v7.4s, v1.8h, v30.8h
+        sqrshl  v4.4s, v4.4s, v31.4s
+        sqrshl  v5.4s, v5.4s, v31.4s
+        sqrshl  v6.4s, v6.4s, v31.4s
+        sqrshl  v7.4s, v7.4s, v31.4s
+        sqadd   v4.4s, v4.4s, v29.4s
+        sqadd   v5.4s, v5.4s, v29.4s
+        sqadd   v6.4s, v6.4s, v29.4s
+        sqadd   v7.4s, v7.4s, v29.4s
+        sqxtn   v0.4h, v4.4s
+        sqxtn2  v0.8h, v5.4s
+        sqxtn   v1.4h, v6.4s
+        sqxtn2  v1.8h, v7.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     d0, [x0]
+        str     d1, [x0, x1]
+        add     x0, x0, x1, lsl 1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+        sub     x1, x1, #8
+1:
+        ldr     q0, [x2]
+        ldr     q1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v4.8h, v0.8b, #6
+        ushll2  v5.8h, v0.16b, #6
+        ushll   v6.8h, v1.8b, #6
+        ushll2  v7.8h, v1.16b, #6
+        smull   v16.4s, v4.4h, v30.4h
+        smull2  v17.4s, v4.8h, v30.8h
+        smull   v18.4s, v5.4h, v30.4h
+        smull2  v19.4s, v5.8h, v30.8h
+        smull   v20.4s, v6.4h, v30.4h
+        smull2  v21.4s, v6.8h, v30.8h
+        smull   v22.4s, v7.4h, v30.4h
+        smull2  v23.4s, v7.8h, v30.8h
+
+        sqrshl  v16.4s, v16.4s, v31.4s
+        sqrshl  v17.4s, v17.4s, v31.4s
+        sqrshl  v18.4s, v18.4s, v31.4s
+        sqrshl  v19.4s, v19.4s, v31.4s
+        sqrshl  v20.4s, v20.4s, v31.4s
+        sqrshl  v21.4s, v21.4s, v31.4s
+        sqrshl  v22.4s, v22.4s, v31.4s
+        sqrshl  v23.4s, v23.4s, v31.4s
+        sqadd   v16.4s, v16.4s, v29.4s
+        sqadd   v17.4s, v17.4s, v29.4s
+        sqadd   v18.4s, v18.4s, v29.4s
+        sqadd   v19.4s, v19.4s, v29.4s
+        sqadd   v20.4s, v20.4s, v29.4s
+        sqadd   v21.4s, v21.4s, v29.4s
+        sqadd   v22.4s, v22.4s, v29.4s
+        sqadd   v23.4s, v23.4s, v29.4s
+        sqxtn   v0.4h, v16.4s
+        sqxtn2  v0.8h, v17.4s
+        sqxtn   v1.4h, v18.4s
+        sqxtn2  v1.8h, v19.4s
+        sqxtn   v2.4h, v20.4s
+        sqxtn2  v2.8h, v21.4s
+        sqxtn   v3.4h, v22.4s
+        sqxtn2  v3.8h, v23.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun2 v0.16b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        sqxtun2 v2.16b, v3.8h
+        str     d0, [x0], #8
+        st1     {v0.s}[2], [x0], x1
+        str     d2, [x0], #8
+        st1     {v2.s}[2], [x0], x1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3
+        ushll   \t0\().8h, \s0\().8b, #6
+        ushll2  \t1\().8h, \s0\().16b, #6
+        smull   \d0\().4s, \t0\().4h, v30.4h
+        smull2  \d1\().4s, \t0\().8h, v30.8h
+        smull   \d2\().4s, \t1\().4h, v30.4h
+        smull2  \d3\().4s, \t1\().8h, v30.8h
+        sqrshl  \d0\().4s, \d0\().4s, v31.4s
+        sqrshl  \d1\().4s, \d1\().4s, v31.4s
+        sqrshl  \d2\().4s, \d2\().4s, v31.4s
+        sqrshl  \d3\().4s, \d3\().4s, v31.4s
+        sqadd   \d0\().4s, \d0\().4s, v29.4s
+        sqadd   \d1\().4s, \d1\().4s, v29.4s
+        sqadd   \d2\().4s, \d2\().4s, v29.4s
+        sqadd   \d3\().4s, \d3\().4s, v29.4s
+        sqxtn   \t0\().4h, \d0\().4s
+        sqxtn2  \t0\().8h, \d1\().4s
+        sqxtn   \t1\().4h, \d2\().4s
+        sqxtn2  \t1\().8h, \d3\().4s
+        sqxtun  \s0\().8b,  \t0\().8h
+        sqxtun2 \s0\().16b, \t1\().8h
+.endm
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ldr     q0, [x2]
+        ldr     q1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        str     q0, [x0]
+        str     q1, [x0, x1]
+        add     x0, x0, x1, lsl 1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ld1     {v0.16b, v1.16b}, [x2], x3
+        ushll   v4.8h, v0.8b, #6
+        ushll2  v5.8h, v0.16b, #6
+        ushll   v6.8h, v1.8b, #6
+        smull   v16.4s, v4.4h, v30.4h
+        smull2  v17.4s, v4.8h, v30.8h
+        smull   v18.4s, v5.4h, v30.4h
+        smull2  v19.4s, v5.8h, v30.8h
+        smull   v20.4s, v6.4h, v30.4h
+        smull2  v21.4s, v6.8h, v30.8h
+        sqrshl  v16.4s, v16.4s, v31.4s
+        sqrshl  v17.4s, v17.4s, v31.4s
+        sqrshl  v18.4s, v18.4s, v31.4s
+        sqrshl  v19.4s, v19.4s, v31.4s
+        sqrshl  v20.4s, v20.4s, v31.4s
+        sqrshl  v21.4s, v21.4s, v31.4s
+        sqadd   v16.4s, v16.4s, v29.4s
+        sqadd   v17.4s, v17.4s, v29.4s
+        sqadd   v18.4s, v18.4s, v29.4s
+        sqadd   v19.4s, v19.4s, v29.4s
+        sqadd   v20.4s, v20.4s, v29.4s
+        sqadd   v21.4s, v21.4s, v29.4s
+        sqxtn   v0.4h, v16.4s
+        sqxtn2  v0.8h, v17.4s
+        sqxtn   v1.4h, v18.4s
+        sqxtn2  v1.8h, v19.4s
+        sqxtn   v2.4h, v20.4s
+        sqxtn2  v2.8h, v21.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        st1     {v0.8b, v1.8b, v2.8b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ld1     {v0.16b, v1.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        st1     {v0.16b, v1.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ld1     {v0.16b, v1.16b, v2.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        st1     {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_HEADER
+        ldur            x12, [sp, #8]          // my
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        movrel          x9, qpel_filters_abs
+        add             x9, x9, x12, lsl 3
+        ldr             d28, [x9]
+        dup             v0.16b, v28.b[0]
+        dup             v1.16b, v28.b[1]
+        dup             v2.16b, v28.b[2]
+        dup             v3.16b, v28.b[3]
+        dup             v4.16b, v28.b[4]
+        dup             v5.16b, v28.b[5]
+        dup             v6.16b, v28.b[6]
+        dup             v7.16b, v28.b[7]
+
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
+
+.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
+
+.macro  QPEL_UNI_W_V_4
+        smull           v24.4s, v24.4h, v30.4h
+        sqrshl          v24.4s, v24.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.s}[0], [x0], x1
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             s16, [x2]
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             s18, [x2]
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             s20, [x2]
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             s22, [x2]
+
+1:      ldr             s23, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s16, [x2]
+        QPEL_FILTER_B     v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s18, [x2]
+        QPEL_FILTER_B     v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s20, [x2]
+        QPEL_FILTER_B     v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s22, [x2]
+        QPEL_FILTER_B     v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_8
+        smull           v24.4s, v26.4h, v30.4h
+        smull2          v25.4s, v26.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.d}[0], [x0], x1
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             d16, [x2]
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             d22, [x2]
+
+1:      ldr             d23, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// Weighted-prediction epilogue for one 16-pixel row.  Inputs: v26/v27 hold
+// the low/high 8x16-bit halves of the filtered row, v30 = wx, v31 =
+// -(denom + 6) (so sqrshl performs a rounding right shift), v29 = ox.
+// Computes clip_u8(((filtered * wx) >> (denom + 6)) + ox) with saturating
+// narrowing, stores 16 bytes to [x0] and advances x0 by the dst stride.
+// Clobbers v24-v27.
+.macro QPEL_UNI_W_V_16
+        smull           v24.4s, v26.4h, v30.4h          // low half * wx
+        smull2          v25.4s, v26.8h, v30.8h
+        smull           v26.4s, v27.4h, v30.4h          // high half * wx
+        smull2          v27.4s, v27.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s          // >> (denom + 6), rounded
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqrshl          v26.4s, v26.4s, v31.4s
+        sqrshl          v27.4s, v27.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s          // + ox, saturating
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s                  // narrow 32 -> 16
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h                  // narrow 16 -> u8 with clip
+        sqxtun2         v24.16b, v26.8h
+        st1             {v24.16b}, [x0], x1
+.endm
+
+// void ff_hevc_put_hevc_qpel_uni_w_v16_8_neon(uint8_t *dst, ptrdiff_t dststride,
+//         const uint8_t *src, ptrdiff_t srcstride, int height, int denom,
+//         int wx, int ox, intptr_t mx, intptr_t my, int width)
+// Vertical 8-tap qpel filter + weighted prediction, 16 pixels per row.
+// The 8 source rows are kept in v16-v23 and rotated through an 8x unrolled
+// loop so only one new row is loaded per output row.
+// Fix vs. v0 of the patch: every shifted-register immediate is written as
+// "lsl #1" — armasm64 (MSVC) rejects the bare "lsl 1" form.
+function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// void ff_hevc_put_hevc_qpel_uni_w_v64_8_neon(uint8_t *dst, ptrdiff_t dststride,
+//         const uint8_t *src, ptrdiff_t srcstride, int height, int denom,
+//         int wx, int ox, intptr_t mx, intptr_t my, int width)
+// Vertical 8-tap qpel filter + weighted prediction for wide blocks:
+// processes the block in 16-column strips (outer loop 3:), re-running the
+// 16-wide row loop (1:) per strip.  w13 = width (9th stack argument);
+// x14/x15/w11 preserve dst/src/height across strips.
+// Fix vs. v0 of the patch: shifted-register immediates written as "lsl #1"
+// — armasm64 (MSVC) rejects the bare "lsl 1" form.
+function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldur            w13, [sp, #16]          // width
+        mov             x14, x0                 // saved dst
+        mov             x15, x2                 // saved src
+        mov             w11, w4                 // saved height
+
+3:
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+
+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        subs            w13, w13, #16           // next 16-column strip
+        add             x14, x14, #16
+        add             x15, x15, #16
+        mov             x0, x14
+        mov             x2, x15
+        mov             w4, w11
+        b.hi            3b
+        ret
+endfunc
Martin Storsjö May 4, 2023, 8:49 a.m. UTC | #3
On Wed, 3 May 2023, myais wrote:

> Hello,
>
> - I splited this patch, Do I need to resubmit or just attach them as 
> attachments? (I attached those patches.  If I need to resummit, please let me 
> know.)

The attached form here is fine with me.

I didn't review it in detail yet, but I think it roughly looks ok, but 
there's a few issues.

The "fate-hevc" tests as part of our testsuite fails with these patches 
applied - please test that and make sure it passes.

The assembly fails to build with MSVC, with errors like these:

libavcodec\aarch64\hevcdsp_qpel_neon.o.asm(2278) 
: error A2173: syntax error in expression
         add     x2, x2, x3, lsl 1

The immediate constant 1 here should be prefixed with #, the same thing 
goes in a lot of other places in the same file.

If compiling only patch 1, it fails to build due to a mismatched #endif in 
the c file.

The second patch adds code that git flags as "No newline at end of file"; 
please don't do that, please make sure your editor saves the file as usual 
with a trailing newline.

The patches have some cases of trailing whitespace, please make sure you 
don't have any.

In the second patch, you're inconsistently using "#if 
__ARM_FEATURE_DOTPROD" and "#if defined(__ARM_FEATURE_DOTPROD)".

Dot product is a new feature we haven't taken advantage of on aarch64 
before. None of my toolchains/environments have this enabled by default. 
It would be good if you'd provide examples of where you're testing it and 
how you configure the build to enable it.

Because right now, most of this assembly runs untested unless building in 
a specific configuration that explicitly enables that extension.

For such features, we generally would want to always compile/assemble the 
feature (as long as the toolchain supports assembling it, with some extra 
flag), and use runtime checks for detecting whether the cpu feature is 
supported. But I guess adding support for that is a bigger separate 
project, and this approach with build time ifdefs if the toolchain 
mandates support for it, is tolerable for now.

If there's not a huge benefit from the dot product instructions, maybe it 
would be best to just not use them, so the whole wide audience can benefit 
from the optimizations?

// Martin
Logan.Lyu May 5, 2023, 3:27 p.m. UTC | #4
Hi Martin,

I updated these patches based on your comments, please help to review it 
again.  And My reply is as follows :


在 2023/5/4 16:49, Martin Storsjö 写道:
> On Wed, 3 May 2023, myais wrote:
>
>>> Hello,
>>>
>>> - I splited this patch, Do I need to resubmit or just attach them as 
>>> attachments? (I attached those patches.  If I need to resummit, 
>>> please let me know.)
>>
>> The attached form here is fine with me.
>>
>> I didn't review it in detail yet, but I think it roughly looks ok, 
>> but there's a few issues.
>>
>> The "fate-hevc" tests as part of our testsuite fails with these 
>> patches applied - please test that and make sure it passes.

I'm having some trouble downloading the fate-suite via rsync at the 
moment, possibly due to network problems. However, I tested decoding of 
HEVC (H.265) files myself, and they decode into YUV files correctly. I 
will keep trying to download the fate-suite and will run the tests as 
soon as I can.

>>
>> The assembly fails to build with MSVC, with errors like these:
>>
>> libavcodec\aarch64\hevcdsp_qpel_neon.o.asm(2278) : error A2173: 
>> syntax error in expression
>>         add     x2, x2, x3, lsl 1
>>
>> The immediate constant 1 here should be prefixed with #, the same 
>> thing goes in a lot of other places in the same file.
- immediate constant prefixed with # ---fixed.
>>
>> If compiling only patch 1, it fails to build due to a mismatched 
>> #endif in the c file.
- mismatched #endif ---fixed.
>>
>> The second patch adds code that git flags as "No newline at end of 
>> file"; please don't do that, please make sure your editor saves the 
>> file as usual with a trailing newline.
- trailing newline --- fixed.
>>
>> The patches have some cases of trailing whitespace, please make sure 
>> you don't have time.
- trailing whitespace ---fixed.
>>
>> In the second patch, you're inconsistently using "#if 
>> __ARM_FEATURE_DOTPROD" and "#if defined(__ARM_FEATURE_DOTPROD)".
-  "#if defined(__ARM_FEATURE_DOTPROD)" is now uniformly used.
>>
>> Dot product is a new feature we haven't taken advantage of on aarch64 
>> before. None of my toolchains/environments have this enabled by 
>> default. It would be good if you'd provide examples of where you're 
>> testing it and how you configure the build to enable it.
Dot product is an optional instruction from Armv8.2 to Armv8.5, and from 
Armv8.6 onwards it is mandatory for implementations, so it can be enabled 
by adding the flag "-march=armv8.6".
>>
>> Because right now, most of this assembly runs untested unless 
>> building in a specific configuration that explicitly enables that 
>> extension.
>>
>> For such features, we generally would want to always compile/assemble 
>> the feature (as long as the toolchain supports assembling it, with 
>> some extra flag), and use runtime checks for detecting whether the 
>> cpu feature is supported. But I guess adding support for that is a 
>> bigger separate project, and this approach with build time ifdefs if 
>> the toolchain mandates support for it, is tolerable for now.
>>
>> If there's not a huge benefit from the dot product instructions, 
>> maybe it would be best to just not use them, so the whole wide 
>> audience can benefit from the optimizations? 

Yes, for calculations in the horizontal direction, the dot product 
instructions can bring a relatively large improvement compared with the 
MLA instructions.

Perhaps in the future,  a more general version can be implemented using 
the mla instruction.

>
>
> // Martin
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
From 02a405dddd970f9d5fe60a3dc4ab61d1869bb230 Mon Sep 17 00:00:00 2001
From: myais <Logan.Lyu@myais.com.cn>
Date: Wed, 3 May 2023 09:53:07 +0800
Subject: [PATCH v1 1/3] lavc/aarch64: new optimization for 8-bit
 hevc_pel_uni_w_pixels and qpel_uni_w_v

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  53 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 710 ++++++++++++++++++++++
 2 files changed, 763 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index be1049a2ec..c68612bd98 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -128,6 +128,55 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
                                          ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                          mx, intptr_t my, int width);
 
+/* Declare the neon entry points of one op for every HEVC block width
+ * (4, 6, 8, 12, 16, 24, 32, 48, 64). */
+#define NEON8_FNPROTO(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+/* Same, for ops that only implement the 4/8/16/64 widths in asm. */
+#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+/* Wire one neon implementation into each dst-width slot of the dsp
+ * function table (indices 1..9 correspond to widths 4..64). */
+#define NEON8_FNASSIGN(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
+/* Only widths 4, 8, 16 and 64 have dedicated asm.  Widths 32 and 48 can
+ * safely reuse the 64 variant, because it iterates in 16-column strips
+ * bounded by the width argument.  The remaining widths (6, 12, 24) must
+ * NOT be routed to a wider variant: those functions store full 8/16-byte
+ * rows and would write past the requested width into the destination
+ * frame (this is what broke fate-hevc).  Leave them on the C fallback. */
+#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
     if (!have_neon(av_get_cpu_flags())) return;
@@ -185,6 +234,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[7][0][1]   =
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 0e7b912678..95d798773d 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -30,6 +30,13 @@ const qpel_filters, align=4
         .byte           0,  1, -5, 17, 58,-10, 4, -1
 endconst
 
+// Absolute values of the HEVC 8-tap quarter-pel filter coefficients (the
+// signed table is qpel_filters above).  The coefficient signs are applied
+// by the fixed umlal/umlsl pattern in QPEL_FILTER_B/B2, which lets the
+// taps stay unsigned 8-bit multiplicands.
+const qpel_filters_abs, align=4
+        .byte           0,  0,  0,  0,  0,  0, 0,  0
+        .byte           1,  4, 10, 58, 17,  5, 1,  0
+        .byte           1,  4, 11, 40, 40, 11, 4,  1
+        .byte           0,  1,  5, 17, 58, 10, 4,  1
+endconst
+
 .macro load_filter m
         movrel          x15, qpel_filters
         add             x15, x15, \m, lsl #3
@@ -482,3 +489,706 @@ endfunc
 put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
+
+
+// void ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon(uint8_t *dst, ptrdiff_t dststride,
+//         const uint8_t *src, ptrdiff_t srcstride, int height, int denom,
+//         int wx, int ox, intptr_t mx, intptr_t my, int width)
+// Weighted copy, 4 pixels per row, two rows per iteration:
+//   dst = clip_u8((((src << 6) * wx) >> (denom + 6)) + ox)
+// Fixes vs. v0 of the patch: ox must be splatted per 32-bit lane
+// (dup v29.4s) since sqadd below consumes v29.4s — the old "dup v29.8h"
+// replicated ox into both halves of every lane; and the shift immediate
+// is written "lsl #1" as required by armasm64 (MSVC).
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // negative shift for sqrshl
+        dup     v29.4s, w7              // ox, one copy per 32-bit lane
+1:
+        ldr     s0, [x2]
+        ldr     s1, [x2, x3]
+        add     x2, x2, x3, lsl #1
+        ushll   v0.8h, v0.8b, #6
+        ushll   v1.8h, v1.8b, #6
+        smull   v0.4s, v0.4h, v30.4h
+        smull   v1.4s, v1.4h, v30.4h
+        sqrshl  v0.4s, v0.4s, v31.4s
+        sqrshl  v1.4s, v1.4s, v31.4s
+        sqadd   v0.4s, v0.4s, v29.4s
+        sqadd   v1.4s, v1.4s, v29.4s
+        sqxtn   v0.4h, v0.4s
+        sqxtn   v1.4h, v1.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     s0, [x0]
+        str     s1, [x0, x1]
+        add     x0, x0, x1, lsl #1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+// Weighted copy, 6 pixels per row, two rows per iteration; same
+// arithmetic as pel_uni_w_pixels4: clip_u8((((src << 6) * wx)
+// >> (denom + 6)) + ox).  Each row is stored as 4 bytes + 1 halfword,
+// hence the dst stride is pre-decremented by 4.
+// NOTE(review): the d-register load reads 8 src bytes for 6 pixels —
+// assumes the usual padded HEVC source buffer; confirm against callers.
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // negative shift for sqrshl
+        dup     v29.4s, w7              // ox
+        sub     x1, x1, #4              // stride minus the 4 bytes stored first
+1:
+        ldr     d0, [x2]
+        ldr     d1, [x2, x3]
+        add     x2, x2, x3, lsl #1
+        ushll   v0.8h, v0.8b, #6
+        ushll   v1.8h, v1.8b, #6
+        smull   v4.4s, v0.4h, v30.4h
+        smull2  v5.4s, v0.8h, v30.8h
+        smull   v6.4s, v1.4h, v30.4h
+        smull2  v7.4s, v1.8h, v30.8h
+        sqrshl  v4.4s, v4.4s, v31.4s
+        sqrshl  v5.4s, v5.4s, v31.4s
+        sqrshl  v6.4s, v6.4s, v31.4s
+        sqrshl  v7.4s, v7.4s, v31.4s
+        sqadd   v4.4s, v4.4s, v29.4s
+        sqadd   v5.4s, v5.4s, v29.4s
+        sqadd   v6.4s, v6.4s, v29.4s
+        sqadd   v7.4s, v7.4s, v29.4s
+        sqxtn   v0.4h, v4.4s
+        sqxtn2  v0.8h, v5.4s
+        sqxtn   v1.4h, v6.4s
+        sqxtn2  v1.8h, v7.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     s0, [x0], #4            // first 4 pixels
+        st1     {v0.h}[2], [x0], x1     // remaining 2 pixels
+        str     s1, [x0], #4
+        st1     {v1.h}[2], [x0], x1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+// Weighted copy, 8 pixels per row, two rows per iteration:
+//   dst = clip_u8((((src << 6) * wx) >> (denom + 6)) + ox)
+// v30 = wx, v31 = -(denom + 6) for sqrshl, v29 = ox.
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // negative shift for sqrshl
+        dup     v29.4s, w7              // ox
+1:
+        ldr     d0, [x2]
+        ldr     d1, [x2, x3]
+        add     x2, x2, x3, lsl #1
+        ushll   v0.8h, v0.8b, #6
+        ushll   v1.8h, v1.8b, #6
+        smull   v4.4s, v0.4h, v30.4h
+        smull2  v5.4s, v0.8h, v30.8h
+        smull   v6.4s, v1.4h, v30.4h
+        smull2  v7.4s, v1.8h, v30.8h
+        sqrshl  v4.4s, v4.4s, v31.4s
+        sqrshl  v5.4s, v5.4s, v31.4s
+        sqrshl  v6.4s, v6.4s, v31.4s
+        sqrshl  v7.4s, v7.4s, v31.4s
+        sqadd   v4.4s, v4.4s, v29.4s
+        sqadd   v5.4s, v5.4s, v29.4s
+        sqadd   v6.4s, v6.4s, v29.4s
+        sqadd   v7.4s, v7.4s, v29.4s
+        sqxtn   v0.4h, v4.4s
+        sqxtn2  v0.8h, v5.4s
+        sqxtn   v1.4h, v6.4s
+        sqxtn2  v1.8h, v7.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     d0, [x0]
+        str     d1, [x0, x1]
+        add     x0, x0, x1, lsl #1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+// Weighted copy, 12 pixels per row, two rows per iteration.  Each row is
+// stored as 8 bytes + 1 word (hence dst stride pre-decremented by 8).
+// NOTE(review): the q-register load reads 16 src bytes for 12 pixels —
+// assumes the usual padded HEVC source buffer; confirm against callers.
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // negative shift for sqrshl
+        dup     v29.4s, w7              // ox
+        sub     x1, x1, #8              // stride minus the 8 bytes stored first
+1:
+        ldr     q0, [x2]
+        ldr     q1, [x2, x3]
+        add     x2, x2, x3, lsl #1
+        ushll   v4.8h, v0.8b, #6
+        ushll2  v5.8h, v0.16b, #6
+        ushll   v6.8h, v1.8b, #6
+        ushll2  v7.8h, v1.16b, #6
+        smull   v16.4s, v4.4h, v30.4h
+        smull2  v17.4s, v4.8h, v30.8h
+        smull   v18.4s, v5.4h, v30.4h
+        smull2  v19.4s, v5.8h, v30.8h
+        smull   v20.4s, v6.4h, v30.4h
+        smull2  v21.4s, v6.8h, v30.8h
+        smull   v22.4s, v7.4h, v30.4h
+        smull2  v23.4s, v7.8h, v30.8h
+
+        sqrshl  v16.4s, v16.4s, v31.4s
+        sqrshl  v17.4s, v17.4s, v31.4s
+        sqrshl  v18.4s, v18.4s, v31.4s
+        sqrshl  v19.4s, v19.4s, v31.4s
+        sqrshl  v20.4s, v20.4s, v31.4s
+        sqrshl  v21.4s, v21.4s, v31.4s
+        sqrshl  v22.4s, v22.4s, v31.4s
+        sqrshl  v23.4s, v23.4s, v31.4s
+        sqadd   v16.4s, v16.4s, v29.4s
+        sqadd   v17.4s, v17.4s, v29.4s
+        sqadd   v18.4s, v18.4s, v29.4s
+        sqadd   v19.4s, v19.4s, v29.4s
+        sqadd   v20.4s, v20.4s, v29.4s
+        sqadd   v21.4s, v21.4s, v29.4s
+        sqadd   v22.4s, v22.4s, v29.4s
+        sqadd   v23.4s, v23.4s, v29.4s
+        sqxtn   v0.4h, v16.4s
+        sqxtn2  v0.8h, v17.4s
+        sqxtn   v1.4h, v18.4s
+        sqxtn2  v1.8h, v19.4s
+        sqxtn   v2.4h, v20.4s
+        sqxtn2  v2.8h, v21.4s
+        sqxtn   v3.4h, v22.4s
+        sqxtn2  v3.8h, v23.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun2 v0.16b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        sqxtun2 v2.16b, v3.8h
+        str     d0, [x0], #8            // first 8 pixels
+        st1     {v0.s}[2], [x0], x1     // remaining 4 pixels
+        str     d2, [x0], #8
+        st1     {v2.s}[2], [x0], x1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+// Weight one 16-byte register in place:
+//   \s0 = clip_u8((((\s0 << 6) * wx) >> (denom + 6)) + ox)
+// \t0/\t1 are 16-bit scratch, \d0-\d3 are 32-bit scratch.  Uses the
+// constants set up by the callers: v30 = wx, v31 = -(denom + 6), v29 = ox.
+.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3
+        ushll   \t0\().8h, \s0\().8b, #6
+        ushll2  \t1\().8h, \s0\().16b, #6
+        smull   \d0\().4s, \t0\().4h, v30.4h
+        smull2  \d1\().4s, \t0\().8h, v30.8h
+        smull   \d2\().4s, \t1\().4h, v30.4h
+        smull2  \d3\().4s, \t1\().8h, v30.8h
+        sqrshl  \d0\().4s, \d0\().4s, v31.4s
+        sqrshl  \d1\().4s, \d1\().4s, v31.4s
+        sqrshl  \d2\().4s, \d2\().4s, v31.4s
+        sqrshl  \d3\().4s, \d3\().4s, v31.4s
+        sqadd   \d0\().4s, \d0\().4s, v29.4s
+        sqadd   \d1\().4s, \d1\().4s, v29.4s
+        sqadd   \d2\().4s, \d2\().4s, v29.4s
+        sqadd   \d3\().4s, \d3\().4s, v29.4s
+        sqxtn   \t0\().4h, \d0\().4s
+        sqxtn2  \t0\().8h, \d1\().4s
+        sqxtn   \t1\().4h, \d2\().4s
+        sqxtn2  \t1\().8h, \d3\().4s
+        sqxtun  \s0\().8b,  \t0\().8h
+        sqxtun2 \s0\().16b, \t1\().8h
+.endm
+
+
+// Weighted copy, 16 pixels per row, two rows per iteration; the per-lane
+// arithmetic lives in PEL_UNI_W_PIXEL_CALC.
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // negative shift for sqrshl
+        dup     v29.4s, w7              // ox
+1:
+        ldr     q0, [x2]
+        ldr     q1, [x2, x3]
+        add     x2, x2, x3, lsl #1
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        str     q0, [x0]
+        str     q1, [x0, x1]
+        add     x0, x0, x1, lsl #1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+
+
+// Weighted copy, 24 pixels per row, one row per iteration.  Only the low
+// 24 of the 32 loaded bytes are processed and stored (three 8-byte
+// stores); the extra 8 loaded bytes are never used.
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // negative shift for sqrshl
+        dup     v29.4s, w7              // ox
+1:
+        ld1     {v0.16b, v1.16b}, [x2], x3
+        ushll   v4.8h, v0.8b, #6
+        ushll2  v5.8h, v0.16b, #6
+        ushll   v6.8h, v1.8b, #6        // only the low 8 bytes of v1 are needed
+        smull   v16.4s, v4.4h, v30.4h
+        smull2  v17.4s, v4.8h, v30.8h
+        smull   v18.4s, v5.4h, v30.4h
+        smull2  v19.4s, v5.8h, v30.8h
+        smull   v20.4s, v6.4h, v30.4h
+        smull2  v21.4s, v6.8h, v30.8h
+        sqrshl  v16.4s, v16.4s, v31.4s
+        sqrshl  v17.4s, v17.4s, v31.4s
+        sqrshl  v18.4s, v18.4s, v31.4s
+        sqrshl  v19.4s, v19.4s, v31.4s
+        sqrshl  v20.4s, v20.4s, v31.4s
+        sqrshl  v21.4s, v21.4s, v31.4s
+        sqadd   v16.4s, v16.4s, v29.4s
+        sqadd   v17.4s, v17.4s, v29.4s
+        sqadd   v18.4s, v18.4s, v29.4s
+        sqadd   v19.4s, v19.4s, v29.4s
+        sqadd   v20.4s, v20.4s, v29.4s
+        sqadd   v21.4s, v21.4s, v29.4s
+        sqxtn   v0.4h, v16.4s
+        sqxtn2  v0.8h, v17.4s
+        sqxtn   v1.4h, v18.4s
+        sqxtn2  v1.8h, v19.4s
+        sqxtn   v2.4h, v20.4s
+        sqxtn2  v2.8h, v21.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        st1     {v0.8b, v1.8b, v2.8b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+// Weighted copy, 32 pixels per row, one row per iteration.
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // negative shift for sqrshl
+        dup     v29.4s, w7              // ox
+1:
+        ld1     {v0.16b, v1.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        st1     {v0.16b, v1.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+
+// Weighted copy, 48 pixels per row, one row per iteration.  The scratch
+// registers are reused between the second and third PEL_UNI_W_PIXEL_CALC,
+// which is safe because each invocation is self-contained.
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // negative shift for sqrshl
+        dup     v29.4s, w7              // ox
+1:
+        ld1     {v0.16b, v1.16b, v2.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        st1     {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+// Weighted copy, 64 pixels per row, one row per iteration.
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5            // w10 = -(denom + 6)
+        dup     v30.8h, w6              // wx
+        dup     v31.4s, w10             // negative shift for sqrshl
+        dup     v29.4s, w7              // ox
+1:
+        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+// Common prologue for the qpel_uni_w_v functions: loads my (10th argument,
+// second stack slot), rewinds src by 3 rows to the first filter tap,
+// splats the 8 |coefficient| bytes for this phase into v0-v7, and sets up
+// the weighting constants (v30 = wx, v31 = -(denom + 6), v29 = ox).
+.macro QPEL_UNI_W_V_HEADER
+        ldur            x12, [sp, #8]          // my
+        sub             x2, x2, x3, lsl #1     // src -= 3 * srcstride
+        sub             x2, x2, x3
+        movrel          x9, qpel_filters_abs
+        add             x9, x9, x12, lsl #3    // 8 bytes of |coeffs| per phase
+        ldr             d28, [x9]
+        dup             v0.16b, v28.b[0]
+        dup             v1.16b, v28.b[1]
+        dup             v2.16b, v28.b[2]
+        dup             v3.16b, v28.b[3]
+        dup             v4.16b, v28.b[4]
+        dup             v5.16b, v28.b[5]
+        dup             v6.16b, v28.b[6]
+        dup             v7.16b, v28.b[7]
+
+        mov             w10, #-6
+        sub             w10, w10, w5           // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
+
+// 8-tap filter on the low 8 bytes of \src0..\src7 into \dst (8x16-bit).
+// v0-v7 hold the |coefficients|; the signs (-,+,-,+,+,-,+,-) of the HEVC
+// qpel filter are baked into the umlal/umlsl choice per tap, so the
+// multiplies stay unsigned 8-bit.  The 16-bit accumulator may wrap, which
+// is harmless since the true filter output fits in a signed 16-bit value.
+.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+// Same as QPEL_FILTER_B but for the high 8 bytes of each source register
+// (the second half of a 16-pixel row).
+.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
+
+// Weighted-prediction epilogue for one 4-pixel row: v24.4h holds the
+// filtered row; applies wx (v30), rounding shift (v31), ox (v29), clips
+// to u8 and stores 4 bytes, advancing x0 by the dst stride.
+.macro  QPEL_UNI_W_V_4
+        smull           v24.4s, v24.4h, v30.4h
+        sqrshl          v24.4s, v24.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.s}[0], [x0], x1
+.endm
+
+// Vertical 8-tap qpel filter + weighted prediction, 4 pixels per row.
+// The 8 live source rows sit in s16-s23 and the loop is unrolled 8x,
+// rotating the register roles so each output row loads exactly one new
+// source row; any of the 8 stages may exit when height (w4) reaches 0.
+function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             s16, [x2]
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s18, [x2]
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s20, [x2]
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s22, [x2]
+
+1:      ldr             s23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s16, [x2]
+        QPEL_FILTER_B     v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s18, [x2]
+        QPEL_FILTER_B     v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s20, [x2]
+        QPEL_FILTER_B     v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s22, [x2]
+        QPEL_FILTER_B     v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_8                           // weight+offset one filtered 8-px row (v26) and store it
+        smull           v24.4s, v26.4h, v30.4h  // widen filtered row * wx
+        smull2          v25.4s, v26.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s  // v31 is negative: rounding shift right by (denom+6)
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s  // + ox, saturating
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s          // narrow 32 -> 16 with saturation
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h          // narrow 16 -> u8 with unsigned saturation
+        st1             {v24.d}[0], [x0], x1    // store 8 bytes, advance dst by dststride
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1 // 8-wide vertical qpel, weighted uni prediction, 8-bit
+        QPEL_UNI_W_V_HEADER                     // splat taps/wx/shift/ox, rewind src by 3 rows
+        ldr             d16, [x2]               // prime v16-v22 with the first 7 rows (8 px each)
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d22, [x2]
+
+1:      ldr             d23, [x2, x3]           // phase 0: newest row -> v23
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23 // 8-tap vertical filter -> v26.8h
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1              // height--
+        b.eq            2f
+
+        ldr             d16, [x2]               // phases 1..7: same filter, 8-row window rotated by one
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.ne            1b                      // window fully rotated; start over
+2:
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_16                          // weight+offset one filtered 16-px row (v26:low, v27:high) and store it
+        smull           v24.4s, v26.4h, v30.4h  // widen filtered pixels * wx, four 4-lane groups
+        smull2          v25.4s, v26.8h, v30.8h
+        smull           v26.4s, v27.4h, v30.4h
+        smull2          v27.4s, v27.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s  // v31 is negative: rounding shift right by (denom+6)
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqrshl          v26.4s, v26.4s, v31.4s
+        sqrshl          v27.4s, v27.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s  // + ox, saturating
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s          // narrow 32 -> 16 with saturation
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h          // narrow 16 -> u8 with unsigned saturation
+        sqxtun2         v24.16b, v26.8h
+        st1             {v24.16b}, [x0], x1     // store 16 bytes, advance dst by dststride
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1 // 16-wide vertical qpel, weighted uni prediction, 8-bit
+        QPEL_UNI_W_V_HEADER                     // splat taps/wx/shift/ox, rewind src by 3 rows
+        ldr             q16, [x2]               // prime v16-v22 with the first 7 rows (16 px each)
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+1:      ldr             q23, [x2, x3]           // phase 0: newest row -> v23
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23 // filter low 8 lanes -> v26
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23 // filter high 8 lanes -> v27
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1              // height--
+        b.eq            2f
+
+        ldr             q16, [x2]               // phases 1..7: same filter, 8-row window rotated by one
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b                      // window fully rotated; start over
+2:
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1 // wide vertical qpel: full width done in 16-column strips
+        QPEL_UNI_W_V_HEADER                     // splat taps/wx/shift/ox, rewind src by 3 rows
+        ldur            w13, [sp, #16]          // width (passed on the stack after mx/my)
+        mov             x14, x0                 // save dst/src strip base and height for reuse per strip
+        mov             x15, x2
+        mov             w11, w4
+
+3:                                              // per-strip entry: run a full-height 16-wide pass
+        ldr             q16, [x2]               // prime v16-v22 with the first 7 rows of this strip
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+
+1:      ldr             q23, [x2, x3]           // phase 0: newest row -> v23
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23 // filter low/high 8 lanes
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1              // height--
+        b.eq            2f
+
+        ldr             q16, [x2]               // phases 1..7: same filter, 8-row window rotated by one
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b                      // window fully rotated; start over
+2:
+        subs            w13, w13, #16           // 16 columns done; move to the next strip
+        add             x14, x14, #16
+        add             x15, x15, #16
+        mov             x0, x14                 // reset dst/src to the new strip base
+        mov             x2, x15
+        mov             w4, w11                 // restore height
+        b.hi            3b                      // more columns remaining?
+        ret
+endfunc
Logan.Lyu May 7, 2023, 4:52 a.m. UTC | #5
Hi, Martin,

I updated the patches again. These patches pass the fate-hevc tests.
fate-hevc did help to find some bugs, which have been fixed now; please
help to review them again.

Thanks.


在 2023/5/5 23:27, myais 写道:
> Hi Martin,
>
> I updated these patches based on your comments, please help to review 
> it again.  And My reply is as follows :
>
>
> 在 2023/5/4 16:49, Martin Storsjö 写道:
>> On Wed, 3 May 2023, myais wrote:
>>
>>>> Hello,
>>>>
>>>> - I splited this patch, Do I need to resubmit or just attach them 
>>>> as attachments? (I attached those patches.  If I need to resummit, 
>>>> please let me know.)
>>>
>>> The attached form here is fine with me.
>>>
>>> I didn't review it in detail yet, but I think it roughly looks ok, 
>>> but there's a few issues.
>>>
>>> The "fate-hevc" tests as part of our testsuite fails with these 
>>> patches applied - please test that and make sure it passes.
>
> I'm having some trouble downloading fate-suite using rsync, and I 
> can't download it now, It may be caused by network problems. but I 
> tested the decoding of hevc (h265) files myself, and it can be decoded 
> into yuv files normally, and I will continue to try to download 
> fate-suite and test it soon.
>
>>>
>>> The assembly fails to build with MSVC, with errors like these:
>>>
>>> libavcodec\aarch64\hevcdsp_qpel_neon.o.asm(2278) : error A2173: 
>>> syntax error in expression
>>>         add     x2, x2, x3, lsl 1
>>>
>>> The immediate constant 1 here should be prefixed with #, the same 
>>> thing goes in a lot of other places in the same file.
> - immediate constant prefixed with # ---fixed.
>>>
>>> If compiling only patch 1, it fails to build due to a mismatched 
>>> #endif in the c file.
> - mismatched #endif ---fixed.
>>>
>>> The second patch adds code that git flags as "No newline at end of 
>>> file"; please don't do that, please make sure your editor saves the 
>>> file as usual with a trailing newline.
> - trailing newline --- fixed.
>>>
>>> The patches have some cases of trailing whitespace, please make sure
>>> you don't add any.
> - trailing whitespace ---fixed.
>>>
>>> In the second patch, you're inconsistently using "#if 
>>> __ARM_FEATURE_DOTPROD" and "#if defined(__ARM_FEATURE_DOTPROD)".
> -  "#if defined(__ARM_FEATURE_DOTPROD)" is now uniformly used.
>>>
>>> Dot product is a new feature we haven't taken advantage of on 
>>> aarch64 before. None of my toolchains/environments have this enabled 
>>> by default. It would be good if you'd provide examples of where 
>>> you're testing it and how you configure the build to enable it.
> Dot product is an optional instruction from Armv8.2 to Armv8.5, and
> from Armv8.6 it is mandatory for implementations, so it can be enabled
> by adding the flag "-march=armv8.6".
>>>
>>> Because right now, most of this assembly runs untested unless 
>>> building in a specific configuration that explicitly enables that 
>>> extension.
>>>
>>> For such features, we generally would want to always 
>>> compile/assemble the feature (as long as the toolchain supports 
>>> assembling it, with some extra flag), and use runtime checks for 
>>> detecting whether the cpu feature is supported. But I guess adding 
>>> support for that is a bigger separate project, and this approach 
>>> with build time ifdefs if the toolchain mandates support for it, is 
>>> tolerable for now.
>>>
>>> If there's not a huge benefit from the dot product instructions, 
>>> maybe it would be best to just not use them, so the whole wide 
>>> audience can benefit from the optimizations? 
>
> Yes, for calculations in the horizontal direction, the dot product 
> command can bring a relatively large improvement, compared with the 
> MLA instructions.
>
> Perhaps in the future,  a more general version can be implemented 
> using the mla instruction.
>
>>
>>
>> // Martin
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
From b08c4e571a49de109a0619fb9de9461af4901115 Mon Sep 17 00:00:00 2001
From: myais <Logan.Lyu@myais.com.cn>
Date: Wed, 3 May 2023 09:53:07 +0800
Subject: [PATCH 1/3] lavc/aarch64: new optimization for 8-bit
 hevc_pel_uni_w_pixels and qpel_uni_w_v

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  51 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 710 ++++++++++++++++++++++
 2 files changed, 761 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index be1049a2ec..6b5341dd45 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -128,6 +128,52 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
                                          ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                          mx, intptr_t my, int width);
 
+#define NEON8_FNPROTO(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+#define NEON8_FNASSIGN(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
+#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
     if (!have_neon(av_get_cpu_flags())) return;
@@ -185,6 +231,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[7][0][1]   =
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 0e7b912678..4783bf174b 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -30,6 +30,13 @@ const qpel_filters, align=4
         .byte           0,  1, -5, 17, 58,-10, 4, -1
 endconst
 
+const qpel_filters_abs, align=4                         // absolute values of the qpel taps;
+        .byte           0,  0,  0,  0,  0,  0, 0,  0    // the signs are hard-coded by the
+        .byte           1,  4, 10, 58, 17,  5, 1,  0    // umlal/umlsl choice in QPEL_FILTER_B/B2
+        .byte           1,  4, 11, 40, 40, 11, 4,  1
+        .byte           0,  1,  5, 17, 58, 10, 4,  1
+endconst
+
 .macro load_filter m
         movrel          x15, qpel_filters
         add             x15, x15, \m, lsl #3
@@ -482,3 +489,706 @@ endfunc
 put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1 // 4-wide copy with weight/offset, 8-bit
+        mov     w10, #-6                // w10 = -(denom + 6): negated shift for sqrshl
+        sub     w10, w10, w5
+        dup     v30.8h, w6              // wx (weight)
+        dup     v31.4s, w10             // negated shift
+        dup     v29.4s, w7              // ox (offset)
+1:
+        ldr     s0, [x2]                // two rows of 4 pixels per iteration
+        ldr     s1, [x2, x3]
+        add     x2, x2, x3, lsl #1
+        ushll   v0.8h, v0.8b, #6        // src << 6 (14-bit intermediate)
+        ushll   v1.8h, v1.8b, #6
+        smull   v0.4s, v0.4h, v30.4h    // * wx
+        smull   v1.4s, v1.4h, v30.4h
+        sqrshl  v0.4s, v0.4s, v31.4s    // rounding shift right by (denom + 6)
+        sqrshl  v1.4s, v1.4s, v31.4s
+        sqadd   v0.4s, v0.4s, v29.4s    // + ox, saturating
+        sqadd   v1.4s, v1.4s, v29.4s
+        sqxtn  v0.4h, v0.4s             // narrow with saturation to u8
+        sqxtn  v1.4h, v1.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     s0, [x0]
+        str     s1, [x0, x1]
+        add     x0, x0, x1, lsl #1
+        subs    w4, w4, #2              // height -= 2
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1 // 6-wide copy with weight/offset, 8-bit
+        mov     w10, #-6                // w10 = -(denom + 6): negated shift for sqrshl
+        sub     w10, w10, w5
+        dup     v30.8h, w6              // wx (weight)
+        dup     v31.4s, w10             // negated shift
+        dup     v29.4s, w7              // ox (offset)
+        sub     x1, x1, #4              // dst is advanced 4 + (stride-4) bytes per row below
+1:
+        ldr     d0, [x2]                // two rows; 8 bytes loaded, 6 pixels stored
+        ldr     d1, [x2, x3]
+        add     x2, x2, x3, lsl #1
+        ushll   v0.8h, v0.8b, #6        // src << 6 (14-bit intermediate)
+        ushll   v1.8h, v1.8b, #6
+        smull   v4.4s, v0.4h, v30.4h    // * wx
+        smull2  v5.4s, v0.8h, v30.8h
+        smull   v6.4s, v1.4h, v30.4h
+        smull2  v7.4s, v1.8h, v30.8h
+        sqrshl  v4.4s, v4.4s, v31.4s    // rounding shift right by (denom + 6)
+        sqrshl  v5.4s, v5.4s, v31.4s
+        sqrshl  v6.4s, v6.4s, v31.4s
+        sqrshl  v7.4s, v7.4s, v31.4s
+        sqadd   v4.4s, v4.4s, v29.4s    // + ox, saturating
+        sqadd   v5.4s, v5.4s, v29.4s
+        sqadd   v6.4s, v6.4s, v29.4s
+        sqadd   v7.4s, v7.4s, v29.4s
+        sqxtn   v0.4h, v4.4s            // narrow with saturation to u8
+        sqxtn2  v0.8h, v5.4s
+        sqxtn   v1.4h, v6.4s
+        sqxtn2  v1.8h, v7.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     s0, [x0], #4            // 4 bytes + 2 bytes = 6 pixels per row
+        st1     {v0.h}[2], [x0], x1
+        str     s1, [x0], #4
+        st1     {v1.h}[2], [x0], x1
+        subs    w4, w4, #2              // height -= 2
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1 // 8-wide copy with weight/offset, 8-bit
+        mov     w10, #-6                // w10 = -(denom + 6): negated shift for sqrshl
+        sub     w10, w10, w5
+        dup     v30.8h, w6              // wx (weight)
+        dup     v31.4s, w10             // negated shift
+        dup     v29.4s, w7              // ox (offset)
+1:
+        ldr     d0, [x2]                // two rows of 8 pixels per iteration
+        ldr     d1, [x2, x3]
+        add     x2, x2, x3, lsl #1
+        ushll   v0.8h, v0.8b, #6        // src << 6 (14-bit intermediate)
+        ushll   v1.8h, v1.8b, #6
+        smull   v4.4s, v0.4h, v30.4h    // * wx
+        smull2  v5.4s, v0.8h, v30.8h
+        smull   v6.4s, v1.4h, v30.4h
+        smull2  v7.4s, v1.8h, v30.8h
+        sqrshl  v4.4s, v4.4s, v31.4s    // rounding shift right by (denom + 6)
+        sqrshl  v5.4s, v5.4s, v31.4s
+        sqrshl  v6.4s, v6.4s, v31.4s
+        sqrshl  v7.4s, v7.4s, v31.4s
+        sqadd   v4.4s, v4.4s, v29.4s    // + ox, saturating
+        sqadd   v5.4s, v5.4s, v29.4s
+        sqadd   v6.4s, v6.4s, v29.4s
+        sqadd   v7.4s, v7.4s, v29.4s
+        sqxtn   v0.4h, v4.4s            // narrow with saturation to u8
+        sqxtn2  v0.8h, v5.4s
+        sqxtn   v1.4h, v6.4s
+        sqxtn2  v1.8h, v7.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     d0, [x0]
+        str     d1, [x0, x1]
+        add     x0, x0, x1, lsl #1
+        subs    w4, w4, #2              // height -= 2
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1 // 12-wide copy with weight/offset, 8-bit
+        mov     w10, #-6                // w10 = -(denom + 6): negated shift for sqrshl
+        sub     w10, w10, w5
+        dup     v30.8h, w6              // wx (weight)
+        dup     v31.4s, w10             // negated shift
+        dup     v29.4s, w7              // ox (offset)
+        sub     x1, x1, #8              // dst advances 8 + (stride-8) bytes per row below
+1:
+        ldr     q0, [x2]                // two rows; 16 bytes loaded, 12 pixels stored
+        ldr     q1, [x2, x3]            // NOTE(review): reads 4 bytes past the 12-px row — assumes readable padding; confirm
+        add     x2, x2, x3, lsl #1
+        ushll   v4.8h, v0.8b, #6        // src << 6 (14-bit intermediate)
+        ushll2  v5.8h, v0.16b, #6
+        ushll   v6.8h, v1.8b, #6
+        ushll2  v7.8h, v1.16b, #6
+        smull   v16.4s, v4.4h, v30.4h   // * wx
+        smull2  v17.4s, v4.8h, v30.8h
+        smull   v18.4s, v5.4h, v30.4h
+        smull2  v19.4s, v5.8h, v30.8h
+        smull   v20.4s, v6.4h, v30.4h
+        smull2  v21.4s, v6.8h, v30.8h
+        smull   v22.4s, v7.4h, v30.4h
+        smull2  v23.4s, v7.8h, v30.8h
+
+        sqrshl  v16.4s, v16.4s, v31.4s  // rounding shift right by (denom + 6)
+        sqrshl  v17.4s, v17.4s, v31.4s
+        sqrshl  v18.4s, v18.4s, v31.4s
+        sqrshl  v19.4s, v19.4s, v31.4s
+        sqrshl  v20.4s, v20.4s, v31.4s
+        sqrshl  v21.4s, v21.4s, v31.4s
+        sqrshl  v22.4s, v22.4s, v31.4s
+        sqrshl  v23.4s, v23.4s, v31.4s
+        sqadd   v16.4s, v16.4s, v29.4s  // + ox, saturating
+        sqadd   v17.4s, v17.4s, v29.4s
+        sqadd   v18.4s, v18.4s, v29.4s
+        sqadd   v19.4s, v19.4s, v29.4s
+        sqadd   v20.4s, v20.4s, v29.4s
+        sqadd   v21.4s, v21.4s, v29.4s
+        sqadd   v22.4s, v22.4s, v29.4s
+        sqadd   v23.4s, v23.4s, v29.4s
+        sqxtn   v0.4h, v16.4s           // narrow with saturation to u8
+        sqxtn2  v0.8h, v17.4s
+        sqxtn   v1.4h, v18.4s
+        sqxtn2  v1.8h, v19.4s
+        sqxtn   v2.4h, v20.4s
+        sqxtn2  v2.8h, v21.4s
+        sqxtn   v3.4h, v22.4s
+        sqxtn2  v3.8h, v23.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun2 v0.16b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        sqxtun2 v2.16b, v3.8h
+        str     d0, [x0], #8            // 8 bytes + 4 bytes = 12 pixels per row
+        st1     {v0.s}[2], [x0], x1
+        str     d2, [x0], #8
+        st1     {v2.s}[2], [x0], x1
+        subs    w4, w4, #2              // height -= 2
+        b.ne    1b
+        ret
+endfunc
+
+.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3 // \s0: 16 px in/out; \t0-\t1, \d0-\d3: scratch
+        ushll   \t0\().8h, \s0\().8b, #6        // src << 6 (14-bit intermediate)
+        ushll2  \t1\().8h, \s0\().16b, #6
+        smull   \d0\().4s, \t0\().4h, v30.4h    // * wx
+        smull2  \d1\().4s, \t0\().8h, v30.8h
+        smull   \d2\().4s, \t1\().4h, v30.4h
+        smull2  \d3\().4s, \t1\().8h, v30.8h
+        sqrshl  \d0\().4s, \d0\().4s, v31.4s    // rounding shift right by (denom + 6)
+        sqrshl  \d1\().4s, \d1\().4s, v31.4s
+        sqrshl  \d2\().4s, \d2\().4s, v31.4s
+        sqrshl  \d3\().4s, \d3\().4s, v31.4s
+        sqadd   \d0\().4s, \d0\().4s, v29.4s    // + ox, saturating
+        sqadd   \d1\().4s, \d1\().4s, v29.4s
+        sqadd   \d2\().4s, \d2\().4s, v29.4s
+        sqadd   \d3\().4s, \d3\().4s, v29.4s
+        sqxtn   \t0\().4h, \d0\().4s            // narrow back to u8 with saturation
+        sqxtn2  \t0\().8h, \d1\().4s
+        sqxtn   \t1\().4h, \d2\().4s
+        sqxtn2  \t1\().8h, \d3\().4s
+        sqxtun  \s0\().8b,  \t0\().8h
+        sqxtun2 \s0\().16b, \t1\().8h
+.endm
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1 // 16-wide copy with weight/offset, 8-bit
+        mov     w10, #-6                // w10 = -(denom + 6): negated shift for sqrshl
+        sub     w10, w10, w5
+        dup     v30.8h, w6              // wx (weight)
+        dup     v31.4s, w10             // negated shift
+        dup     v29.4s, w7              // ox (offset)
+1:
+        ldr     q0, [x2]                // two rows of 16 pixels per iteration
+        ldr     q1, [x2, x3]
+        add     x2, x2, x3, lsl #1
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19 // weight/offset each row in place
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        str     q0, [x0]
+        str     q1, [x0, x1]
+        add     x0, x0, x1, lsl #1
+        subs    w4, w4, #2              // height -= 2
+        b.ne    1b
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1 // 24-wide copy with weight/offset, 8-bit
+        mov     w10, #-6                // w10 = -(denom + 6): negated shift for sqrshl
+        sub     w10, w10, w5
+        dup     v30.8h, w6              // wx (weight)
+        dup     v31.4s, w10             // negated shift
+        dup     v29.4s, w7              // ox (offset)
+1:
+        ld1     {v0.16b, v1.16b}, [x2], x3 // NOTE(review): loads 32 bytes for 24 px — assumes readable padding; confirm
+        ushll   v4.8h, v0.8b, #6        // src << 6; only the low 8 of v1 are used
+        ushll2  v5.8h, v0.16b, #6
+        ushll   v6.8h, v1.8b, #6
+        smull   v16.4s, v4.4h, v30.4h   // * wx
+        smull2  v17.4s, v4.8h, v30.8h
+        smull   v18.4s, v5.4h, v30.4h
+        smull2  v19.4s, v5.8h, v30.8h
+        smull   v20.4s, v6.4h, v30.4h
+        smull2  v21.4s, v6.8h, v30.8h
+        sqrshl  v16.4s, v16.4s, v31.4s  // rounding shift right by (denom + 6)
+        sqrshl  v17.4s, v17.4s, v31.4s
+        sqrshl  v18.4s, v18.4s, v31.4s
+        sqrshl  v19.4s, v19.4s, v31.4s
+        sqrshl  v20.4s, v20.4s, v31.4s
+        sqrshl  v21.4s, v21.4s, v31.4s
+        sqadd   v16.4s, v16.4s, v29.4s  // + ox, saturating
+        sqadd   v17.4s, v17.4s, v29.4s
+        sqadd   v18.4s, v18.4s, v29.4s
+        sqadd   v19.4s, v19.4s, v29.4s
+        sqadd   v20.4s, v20.4s, v29.4s
+        sqadd   v21.4s, v21.4s, v29.4s
+        sqxtn   v0.4h, v16.4s           // narrow with saturation to u8
+        sqxtn2  v0.8h, v17.4s
+        sqxtn   v1.4h, v18.4s
+        sqxtn2  v1.8h, v19.4s
+        sqxtn   v2.4h, v20.4s
+        sqxtn2  v2.8h, v21.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        st1     {v0.8b, v1.8b, v2.8b}, [x0], x1 // 24 bytes per row
+        subs    w4, w4, #1              // height--
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1 // 32-wide copy with weight/offset, 8-bit
+        mov     w10, #-6                // w10 = -(denom + 6): negated shift for sqrshl
+        sub     w10, w10, w5
+        dup     v30.8h, w6              // wx (weight)
+        dup     v31.4s, w10             // negated shift
+        dup     v29.4s, w7              // ox (offset)
+1:
+        ld1     {v0.16b, v1.16b}, [x2], x3 // one 32-px row per iteration
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19 // weight/offset each 16-px half in place
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        st1     {v0.16b, v1.16b}, [x0], x1
+        subs    w4, w4, #1              // height--
+        b.ne    1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1 // 48-wide copy with weight/offset, 8-bit
+        mov     w10, #-6                // w10 = -(denom + 6): negated shift for sqrshl
+        sub     w10, w10, w5
+        dup     v30.8h, w6              // wx (weight)
+        dup     v31.4s, w10             // negated shift
+        dup     v29.4s, w7              // ox (offset)
+1:
+        ld1     {v0.16b, v1.16b, v2.16b}, [x2], x3 // one 48-px row per iteration
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19 // scratch regs reusable once each 16-px group is done
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        st1     {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs    w4, w4, #1              // height--
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1 // 64-wide copy with weight/offset, 8-bit
+        mov     w10, #-6                // w10 = -(denom + 6): negated shift for sqrshl
+        sub     w10, w10, w5
+        dup     v30.8h, w6              // wx (weight)
+        dup     v31.4s, w10             // negated shift
+        dup     v29.4s, w7              // ox (offset)
+1:
+        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 // one 64-px row per iteration
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19 // scratch regs reusable once each 16-px group is done
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs    w4, w4, #1              // height--
+        b.ne    1b
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_HEADER              // shared prologue for the vertical weighted qpel functions
+        ldur            x12, [sp, #8]          // my
+        sub             x2, x2, x3, lsl #1     // src -= 3 * srcstride (filter history rows)
+        sub             x2, x2, x3
+        movrel          x9, qpel_filters_abs   // tap magnitudes for filter index my
+        add             x9, x9, x12, lsl #3
+        ldr             d28, [x9]
+        dup             v0.16b, v28.b[0]       // splat the 8 taps into v0-v7
+        dup             v1.16b, v28.b[1]
+        dup             v2.16b, v28.b[2]
+        dup             v3.16b, v28.b[3]
+        dup             v4.16b, v28.b[4]
+        dup             v5.16b, v28.b[5]
+        dup             v6.16b, v28.b[6]
+        dup             v7.16b, v28.b[7]
+
+        mov             w10, #-6               // w10 = -(denom + 6)
+        sub             w10, w10, w5
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift (negated, for sqrshl)
+        dup             v29.4s, w7              // ox
+.endm
+
+.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7 // 8-tap filter on the low 8 bytes
+        umull           \dst\().8h, \src1\().8b, v1.8b  // tap magnitudes in v0-v7;
+        umlsl           \dst\().8h, \src0\().8b, v0.8b  // signs fixed by the umlal/umlsl choice
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7 // as QPEL_FILTER_B, high 8 bytes
+        umull2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
+
+.macro  QPEL_UNI_W_V_4                          // weight+offset one filtered 4-px row (v24) and store it
+        smull           v24.4s, v24.4h, v30.4h  // filtered row * wx
+        sqrshl          v24.4s, v24.4s, v31.4s  // v31 is negative: rounding shift right by (denom+6)
+        sqadd           v24.4s, v24.4s, v29.4s  // + ox, saturating
+        sqxtn           v24.4h, v24.4s          // narrow 32 -> 16 with saturation
+        sqxtun          v24.8b, v24.8h          // narrow 16 -> u8 with unsigned saturation
+        st1             {v24.s}[0], [x0], x1    // store 4 bytes, advance dst by dststride
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1 // 4-wide vertical qpel, weighted uni prediction, 8-bit
+        QPEL_UNI_W_V_HEADER                     // splat taps/wx/shift/ox, rewind src by 3 rows
+        ldr             s16, [x2]               // prime v16-v22 with the first 7 rows (4 px each)
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s18, [x2]
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s20, [x2]
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s22, [x2]
+
+1:      ldr             s23, [x2, x3]           // phase 0: newest row -> v23
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v16, v17, v18, v19, v20, v21, v22, v23 // 8-tap vertical filter -> v24.8h
+        QPEL_UNI_W_V_4                          // weight, shift, offset, narrow, store 4 bytes
+        subs            w4, w4, #1              // height--
+        b.eq            2f
+
+        ldr             s16, [x2]               // phases 1..7: same filter, 8-row window rotated by one
+        QPEL_FILTER_B     v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s18, [x2]
+        QPEL_FILTER_B     v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s20, [x2]
+        QPEL_FILTER_B     v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s22, [x2]
+        QPEL_FILTER_B     v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.ne            1b                      // window fully rotated; start over
+2:
+        ret
+endfunc
+
+// Weighted-prediction epilogue for 8 filtered samples in v26:
+// widen to 32 bit, scale by wx, rounding-shift by denom+6 (v31 = negated
+// count), add ox, saturate-narrow to u8, store one 8-pixel row.
+.macro QPEL_UNI_W_V_8
+        smull           v24.4s, v26.4h, v30.4h
+        smull2          v25.4s, v26.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.d}[0], [x0], x1
+.endm
+
+// Same as the width-4 variant, but loads 8 bytes per row (d registers) and
+// uses QPEL_UNI_W_V_8 for the weighted epilogue.  8x unrolled register
+// rotation over d16..d23.
+function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             d16, [x2]
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d22, [x2]
+
+1:      ldr             d23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// Weighted-prediction epilogue for 16 filtered samples (low half in v26,
+// high half in v27): scale by wx, rounding-shift by denom+6 (v31 = negated
+// count), add ox, saturate-narrow to u8, store one 16-pixel row.
+.macro QPEL_UNI_W_V_16
+        smull           v24.4s, v26.4h, v30.4h
+        smull2          v25.4s, v26.8h, v30.8h
+        smull           v26.4s, v27.4h, v30.4h
+        smull2          v27.4s, v27.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqrshl          v26.4s, v26.4s, v31.4s
+        sqrshl          v27.4s, v27.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h
+        sqxtun2         v24.16b, v26.8h
+        st1             {v24.16b}, [x0], x1
+.endm
+
+// Width-16 variant: loads full 16-byte rows (q registers) and filters the
+// low/high halves with QPEL_FILTER_B / QPEL_FILTER_B2 into v26/v27.
+// Same 8x-unrolled register rotation over q16..q23.
+function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// Wide variant (also registered for widths 32 and 48): processes the block
+// in 16-column strips.  w13 = width (11th arg, read from [sp, #16]);
+// x14/x15/w11 back up dst/src/height so each strip restarts from the top.
+function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldur            w13, [sp, #16]
+        mov             x14, x0
+        mov             x15, x2
+        mov             w11, w4
+
+// outer loop: one 16-pixel-wide strip per iteration
+3:
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+
+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+// advance to the next 16-column strip; loop while columns remain
+        subs            w13, w13, #16
+        add             x14, x14, #16
+        add             x15, x15, #16
+        mov             x0, x14
+        mov             x2, x15
+        mov             w4, w11
+        b.hi            3b
+        ret
+endfunc
Martin Storsjö May 26, 2023, 8:34 a.m. UTC | #6
Hi,

Overall these patches seem mostly ok, but I've got a few minor points to 
make:

- The usdot instruction requires the i8mm extension (part of armv8.6-a), 
while udot or sdot would require the dotprod extension (available in 
armv8.4-a). If you could manage with udot or sdot, these functions would 
be usable on a wider set of CPUs.

Therefore, the current guards are wrong. Also, I finally got support 
implemented for optionally using these cpu extensions, even if the 
compile-time baseline doesn't include them, by enabling them at runtime. See the
patchset at https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=9009.

To adapt your patches on top of this, see the two topmost commits at 
https://github.com/mstorsjo/ffmpeg/commits/archext.

- The indentation is inconsistent; in the first patch, you have some 
instructions written like this:

+        sqadd   v1.4s, v1.4s, v29.4s

While you later use this style:

+        dup             v1.16b, v28.b[1]

The latter seems to match the style we commonly use; please reformat your 
code to match that consistently.

With some macro invocations in the first patch, you also seem to have too 
much indentation in some places. See e.g. this:

+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f

(If the macro name is too long, that's ok, but here there's no need to 
have those lines unaligned.)

- In the third patch, you've got multiple parameters from the stack like 
this:

+        ldp             x14, x15, [sp]          // mx, my
+        ldr             w13, [sp, #16]          // width

I see that the mx and my parameters are intptr_t; that's good, since if
they would be 32 bit integers, the ABI for such parameters on the stack 
differ between macOS/Darwin and Linux. But as long as they're intptr_t 
they behave the same.

- At the same place, you're backing up a bunch of registers:

+        stp             x20, x21, [sp, #-16]!
+        stp             x22, x23, [sp, #-16]!
+        stp             x24, x25, [sp, #-16]!
+        stp             x26, x27, [sp, #-16]!
+        stp             x28, x30, [sp, #-16]!

This is inefficient; instead, do this:

+        stp             x28, x30, [sp, #-80]!
+        stp             x20, x21, [sp, #16]
+        stp             x22, x23, [sp, #32]
+        stp             x24, x25, [sp, #48]
+        stp             x26, x27, [sp, #64]

Also, following that, I see that you back up the stack pointer in x28. Why 
do you use x28 for that? Using x29 would be customary as frame pointer.

Aside for that, I think the rest of the patches is acceptable.

// Martin
Logan.Lyu May 27, 2023, 8:03 a.m. UTC | #7
yes, of course,   Re-add the missing mailing list...


在 2023/5/27 13:45, Martin Storsjö 写道:
> Hi,
>
> Thanks - can you send the new patches to the mailing list too? They 
> need to be available publicly for review before they can be accepted. 
> (I didn't check these yet.)
>
> // Martin
>
> On Sat, 27 May 2023, myais wrote:
>
>> Hi, Martin,
>>
>> Thank you for your correction, and I completed the modification 
>> according to your opinion, the attachments are the new patches.
>>
>>
>> Thanks.
>>
>>
>> 在 2023/5/24 20:49, Martin Storsjö 写道:
>>> Hi,
>>>
>>> On Tue, 23 May 2023, myais wrote:
>>>
>>>> Do you have any new opinions here? I am looking forward to your reply.
>>>
>>> I've started looking at this now after focusing on a different issue 
>>> first.
>>>
>>> The big thing is that this is the first new optional instruction set 
>>> on top of aarch64, so there's a bit of work to do to handle that 
>>> properly (with runtime detection, and assembling that code even if 
>>> the baseline target doesn't support it). I've started looking into 
>>> that now.
>>>
>>> In your case your patches don't care about that and just hardcode 
>>> enabling it if the compiler baseline support the instruction, and 
>>> skips it otherwise. I guess that's possibly fine, but your condition 
>>> for the code is wrong; the "usdot" instruction requires the "i8mm" 
>>> extension, not "dotprod". i8mm is part of armv8.6-a (and is 
>>> available on graviton 3, luckily, which allows me to test it).
>>>
>>> So instead of __ARM_FEATURE_DOTPROD, this should use 
>>> __ARM_FEATURE_MATMUL_INT8, and the functions should probably use 
>>> i8mm as suffix instead of dotprod. I guess you can resubmit them 
>>> with that change (and make sure you don't end up with the "no 
>>> trailing newline at the end of file" issue in the changed files in 
>>> any intermediate commit).
>>>
>>>> In addition, I have some new similar patches, which are the aarch64 
>>>> implementations of some other functions, should I wait for your 
>>>> feedback before submitting or submit it directly?
>>>
>>> I'd prefer to settle these patches first before taking on the next set.
>>>
>>> // Martin
>>
From b08c4e571a49de109a0619fb9de9461af4901115 Mon Sep 17 00:00:00 2001
From: myais <Logan.Lyu@myais.com.cn>
Date: Wed, 3 May 2023 09:53:07 +0800
Subject: [PATCH 1/3] lavc/aarch64: new optimization for 8-bit
 hevc_pel_uni_w_pixels and qpel_uni_w_v

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  51 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 710 ++++++++++++++++++++++
 2 files changed, 761 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index be1049a2ec..6b5341dd45 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -128,6 +128,52 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
                                          ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                          mx, intptr_t my, int width);
 
+/* Declare the 8-bit NEON prototypes for all nine HEVC block-width variants
+ * (4, 6, 8, 12, 16, 24, 32, 48, 64). */
+#define NEON8_FNPROTO(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+/* Same, but only the 4/8/16/64 width variants exist in assembly. */
+#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+/* Wire one NEON implementation per width index (1..9) into the dsp table. */
+#define NEON8_FNASSIGN(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
+/* Indices 7..9 all map to the 64 variant; the assembly 64 version iterates
+ * over the width argument in 16-column strips, so it covers 32/48 as well. */
+#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
     if (!have_neon(av_get_cpu_flags())) return;
@@ -185,6 +231,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[7][0][1]   =
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 0e7b912678..4783bf174b 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -30,6 +30,13 @@ const qpel_filters, align=4
         .byte           0,  1, -5, 17, 58,-10, 4, -1
 endconst
 
+// Absolute values of the qpel filter taps (cf. the signed qpel_filters
+// table above); the tap signs are re-applied in QPEL_FILTER_B/B2 through
+// the choice of umlal (add) vs umlsl (subtract).
+const qpel_filters_abs, align=4
+        .byte           0,  0,  0,  0,  0,  0, 0,  0
+        .byte           1,  4, 10, 58, 17,  5, 1,  0
+        .byte           1,  4, 11, 40, 40, 11, 4,  1
+        .byte           0,  1,  5, 17, 58, 10, 4,  1
+endconst
+
 .macro load_filter m
         movrel          x15, qpel_filters
         add             x15, x15, \m, lsl #3
@@ -482,3 +489,706 @@ endfunc
 put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
+
+
+// void ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon(uint8_t *dst, ptrdiff_t dststride,
+//         const uint8_t *src, ptrdiff_t srcstride, int height, int denom,
+//         int wx, int ox, intptr_t mx, intptr_t my, int width)
+// Uni weighted-prediction copy, 4 pixels wide, two rows per iteration:
+// dst = clip_u8((((src << 6) * wx) >> (denom + 6)) + ox), with rounding.
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // negated shift count
+        dup             v29.4s, w7              // ox
+1:
+        ldr             s0, [x2]
+        ldr             s1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6
+        ushll           v1.8h, v1.8b, #6
+        smull           v0.4s, v0.4h, v30.4h
+        smull           v1.4s, v1.4h, v30.4h
+        sqrshl          v0.4s, v0.4s, v31.4s    // rounding shift right by denom+6
+        sqrshl          v1.4s, v1.4s, v31.4s
+        sqadd           v0.4s, v0.4s, v29.4s
+        sqadd           v1.4s, v1.4s, v29.4s
+        sqxtn           v0.4h, v0.4s
+        sqxtn           v1.4h, v1.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+        str             s0, [x0]
+        str             s1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+// Width-6 weighted copy: computes 8 columns per row, stores 4 + 2 bytes
+// (dststride is pre-decremented by 4 to account for the split store).
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // negated shift count
+        dup             v29.4s, w7              // ox
+        sub             x1, x1, #4
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6
+        ushll           v1.8h, v1.8b, #6
+        smull           v4.4s, v0.4h, v30.4h
+        smull2          v5.4s, v0.8h, v30.8h
+        smull           v6.4s, v1.4h, v30.4h
+        smull2          v7.4s, v1.8h, v30.8h
+        sqrshl          v4.4s, v4.4s, v31.4s
+        sqrshl          v5.4s, v5.4s, v31.4s
+        sqrshl          v6.4s, v6.4s, v31.4s
+        sqrshl          v7.4s, v7.4s, v31.4s
+        sqadd           v4.4s, v4.4s, v29.4s
+        sqadd           v5.4s, v5.4s, v29.4s
+        sqadd           v6.4s, v6.4s, v29.4s
+        sqadd           v7.4s, v7.4s, v29.4s
+        sqxtn           v0.4h, v4.4s
+        sqxtn2          v0.8h, v5.4s
+        sqxtn           v1.4h, v6.4s
+        sqxtn2          v1.8h, v7.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+        str             s0, [x0], #4
+        st1             {v0.h}[2], [x0], x1
+        str             s1, [x0], #4
+        st1             {v1.h}[2], [x0], x1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+// Width-8 weighted copy, two rows per iteration.
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // negated shift count
+        dup             v29.4s, w7              // ox
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6
+        ushll           v1.8h, v1.8b, #6
+        smull           v4.4s, v0.4h, v30.4h
+        smull2          v5.4s, v0.8h, v30.8h
+        smull           v6.4s, v1.4h, v30.4h
+        smull2          v7.4s, v1.8h, v30.8h
+        sqrshl          v4.4s, v4.4s, v31.4s
+        sqrshl          v5.4s, v5.4s, v31.4s
+        sqrshl          v6.4s, v6.4s, v31.4s
+        sqrshl          v7.4s, v7.4s, v31.4s
+        sqadd           v4.4s, v4.4s, v29.4s
+        sqadd           v5.4s, v5.4s, v29.4s
+        sqadd           v6.4s, v6.4s, v29.4s
+        sqadd           v7.4s, v7.4s, v29.4s
+        sqxtn           v0.4h, v4.4s
+        sqxtn2          v0.8h, v5.4s
+        sqxtn           v1.4h, v6.4s
+        sqxtn2          v1.8h, v7.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+        str             d0, [x0]
+        str             d1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+// Width-12 weighted copy: computes 16 columns per row, stores 8 + 4 bytes
+// (dststride pre-decremented by 8 for the split store).
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // negated shift count
+        dup             v29.4s, w7              // ox
+        sub             x1, x1, #8
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v4.8h, v0.8b, #6
+        ushll2          v5.8h, v0.16b, #6
+        ushll           v6.8h, v1.8b, #6
+        ushll2          v7.8h, v1.16b, #6
+        smull           v16.4s, v4.4h, v30.4h
+        smull2          v17.4s, v4.8h, v30.8h
+        smull           v18.4s, v5.4h, v30.4h
+        smull2          v19.4s, v5.8h, v30.8h
+        smull           v20.4s, v6.4h, v30.4h
+        smull2          v21.4s, v6.8h, v30.8h
+        smull           v22.4s, v7.4h, v30.4h
+        smull2          v23.4s, v7.8h, v30.8h
+
+        sqrshl          v16.4s, v16.4s, v31.4s
+        sqrshl          v17.4s, v17.4s, v31.4s
+        sqrshl          v18.4s, v18.4s, v31.4s
+        sqrshl          v19.4s, v19.4s, v31.4s
+        sqrshl          v20.4s, v20.4s, v31.4s
+        sqrshl          v21.4s, v21.4s, v31.4s
+        sqrshl          v22.4s, v22.4s, v31.4s
+        sqrshl          v23.4s, v23.4s, v31.4s
+        sqadd           v16.4s, v16.4s, v29.4s
+        sqadd           v17.4s, v17.4s, v29.4s
+        sqadd           v18.4s, v18.4s, v29.4s
+        sqadd           v19.4s, v19.4s, v29.4s
+        sqadd           v20.4s, v20.4s, v29.4s
+        sqadd           v21.4s, v21.4s, v29.4s
+        sqadd           v22.4s, v22.4s, v29.4s
+        sqadd           v23.4s, v23.4s, v29.4s
+        sqxtn           v0.4h, v16.4s
+        sqxtn2          v0.8h, v17.4s
+        sqxtn           v1.4h, v18.4s
+        sqxtn2          v1.8h, v19.4s
+        sqxtn           v2.4h, v20.4s
+        sqxtn2          v2.8h, v21.4s
+        sqxtn           v3.4h, v22.4s
+        sqxtn2          v3.8h, v23.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun2         v0.16b, v1.8h
+        sqxtun          v2.8b, v2.8h
+        sqxtun2         v2.16b, v3.8h
+        str             d0, [x0], #8
+        st1             {v0.s}[2], [x0], x1
+        str             d2, [x0], #8
+        st1             {v2.s}[2], [x0], x1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+// Weighted-prediction calculation for 16 pixels held in \s0:
+//   \s0 = clip_u8((((\s0 << 6) * wx) >> (denom + 6)) + ox)
+// \t0/\t1 are 8h temporaries, \d0..\d3 are 4s temporaries; v30 = wx,
+// v31 = negated shift count, v29 = ox (set up by the caller).
+.macro PEL_UNI_W_PIXEL_CALC s0, t0, t1, d0, d1, d2, d3
+        ushll           \t0\().8h, \s0\().8b, #6
+        ushll2          \t1\().8h, \s0\().16b, #6
+        smull           \d0\().4s, \t0\().4h, v30.4h
+        smull2          \d1\().4s, \t0\().8h, v30.8h
+        smull           \d2\().4s, \t1\().4h, v30.4h
+        smull2          \d3\().4s, \t1\().8h, v30.8h
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s
+        sqrshl          \d1\().4s, \d1\().4s, v31.4s
+        sqrshl          \d2\().4s, \d2\().4s, v31.4s
+        sqrshl          \d3\().4s, \d3\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s
+        sqadd           \d1\().4s, \d1\().4s, v29.4s
+        sqadd           \d2\().4s, \d2\().4s, v29.4s
+        sqadd           \d3\().4s, \d3\().4s, v29.4s
+        sqxtn           \t0\().4h, \d0\().4s
+        sqxtn2          \t0\().8h, \d1\().4s
+        sqxtn           \t1\().4h, \d2\().4s
+        sqxtn2          \t1\().8h, \d3\().4s
+        sqxtun          \s0\().8b,  \t0\().8h
+        sqxtun2         \s0\().16b, \t1\().8h
+.endm
+
+
+// Width-16 weighted copy, two rows per iteration via PEL_UNI_W_PIXEL_CALC.
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // negated shift count
+        dup             v29.4s, w7              // ox
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        str             q0, [x0]
+        str             q1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+
+
+// Width-24 weighted copy, one row per iteration (24 = 3 x 8 columns).
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // negated shift count
+        dup             v29.4s, w7              // ox
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3
+        ushll           v4.8h, v0.8b, #6
+        ushll2          v5.8h, v0.16b, #6
+        ushll           v6.8h, v1.8b, #6
+        smull           v16.4s, v4.4h, v30.4h
+        smull2          v17.4s, v4.8h, v30.8h
+        smull           v18.4s, v5.4h, v30.4h
+        smull2          v19.4s, v5.8h, v30.8h
+        smull           v20.4s, v6.4h, v30.4h
+        smull2          v21.4s, v6.8h, v30.8h
+        sqrshl          v16.4s, v16.4s, v31.4s
+        sqrshl          v17.4s, v17.4s, v31.4s
+        sqrshl          v18.4s, v18.4s, v31.4s
+        sqrshl          v19.4s, v19.4s, v31.4s
+        sqrshl          v20.4s, v20.4s, v31.4s
+        sqrshl          v21.4s, v21.4s, v31.4s
+        sqadd           v16.4s, v16.4s, v29.4s
+        sqadd           v17.4s, v17.4s, v29.4s
+        sqadd           v18.4s, v18.4s, v29.4s
+        sqadd           v19.4s, v19.4s, v29.4s
+        sqadd           v20.4s, v20.4s, v29.4s
+        sqadd           v21.4s, v21.4s, v29.4s
+        sqxtn           v0.4h, v16.4s
+        sqxtn2          v0.8h, v17.4s
+        sqxtn           v1.4h, v18.4s
+        sqxtn2          v1.8h, v19.4s
+        sqxtn           v2.4h, v20.4s
+        sqxtn2          v2.8h, v21.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+        sqxtun          v2.8b, v2.8h
+        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Width-32 weighted copy, one row (2 x 16 columns) per iteration.
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // negated shift count
+        dup             v29.4s, w7              // ox
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        st1             {v0.16b, v1.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+
+// Width-48 weighted copy, one row (3 x 16 columns) per iteration; the
+// temporaries v16-v19 are safely reused for the third vector.
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // negated shift count
+        dup             v29.4s, w7              // ox
+1:
+        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Width-64 weighted copy, one row (4 x 16 columns) per iteration.
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // negated shift count
+        dup             v29.4s, w7              // ox
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Common setup for the qpel_uni_w_v functions: reads my (10th arg) from the
+// stack, rewinds src by 3 rows so the 8-tap window is centred on the output
+// row, splats the absolute filter taps into v0..v7, and prepares the
+// weighted-prediction constants: v30 = wx, v31 = -(denom + 6) (negated
+// rounding-shift count for sqrshl), v29 = ox.
+.macro QPEL_UNI_W_V_HEADER
+        ldur            x12, [sp, #8]          // my
+        sub             x2, x2, x3, lsl #1     // src -= 3 * srcstride
+        sub             x2, x2, x3
+        movrel          x9, qpel_filters_abs
+        add             x9, x9, x12, lsl #3
+        ldr             d28, [x9]
+        dup             v0.16b, v28.b[0]
+        dup             v1.16b, v28.b[1]
+        dup             v2.16b, v28.b[2]
+        dup             v3.16b, v28.b[3]
+        dup             v4.16b, v28.b[4]
+        dup             v5.16b, v28.b[5]
+        dup             v6.16b, v28.b[6]
+        dup             v7.16b, v28.b[7]
+
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
+
+.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7       // 8-tap filter on the low 8 bytes; coeffs in v0-v7 are absolute values, signs are hardcoded below
+        umull           \dst\().8h, \src1\().8b, v1.8b          // dst  = +c1*s1
+        umlsl           \dst\().8h, \src0\().8b, v0.8b          // dst -=  c0*s0
+        umlsl           \dst\().8h, \src2\().8b, v2.8b          // dst -=  c2*s2
+        umlal           \dst\().8h, \src3\().8b, v3.8b          // dst +=  c3*s3
+        umlal           \dst\().8h, \src4\().8b, v4.8b          // dst +=  c4*s4
+        umlsl           \dst\().8h, \src5\().8b, v5.8b          // dst -=  c5*s5
+        umlal           \dst\().8h, \src6\().8b, v6.8b          // dst +=  c6*s6
+        umlsl           \dst\().8h, \src7\().8b, v7.8b          // dst -=  c7*s7
+.endm
+
+.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7      // same filter as QPEL_FILTER_B but on the high 8 bytes (umull2/umlal2/umlsl2)
+        umull2          \dst\().8h, \src1\().16b, v1.16b        // dst  = +c1*s1
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b        // dst -=  c0*s0
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b        // dst -=  c2*s2
+        umlal2          \dst\().8h, \src3\().16b, v3.16b        // dst +=  c3*s3
+        umlal2          \dst\().8h, \src4\().16b, v4.16b        // dst +=  c4*s4
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b        // dst -=  c5*s5
+        umlal2          \dst\().8h, \src6\().16b, v6.16b        // dst +=  c6*s6
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b        // dst -=  c7*s7
+.endm
+
+.macro  QPEL_UNI_W_V_4                  // weight, shift, offset and store 4 filtered pixels held in v24
+        smull           v24.4s, v24.4h, v30.4h          // widen: filtered * wx
+        sqrshl          v24.4s, v24.4s, v31.4s          // rounding right shift by (6 + denom), v31 is negative
+        sqadd           v24.4s, v24.4s, v29.4s          // + ox, saturating
+        sqxtn           v24.4h, v24.4s                  // saturating narrow s32 -> s16
+        sqxtun          v24.8b, v24.8h                  // saturating narrow s16 -> u8
+        st1             {v24.s}[0], [x0], x1            // store 4 output pixels
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1        // x0 dst, x1 dststride, x2 src, x3 srcstride, w4 height, w5 denom, w6 wx, w7 ox, [sp+8] my
+        QPEL_UNI_W_V_HEADER
+        ldr             s16, [x2]               // prime the 7-row history v16..v22 (4 bytes per row)
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s18, [x2]
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s20, [x2]
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s22, [x2]
+
+1:      ldr             s23, [x2, x3]           // loop unrolled 8x: rotate v16-v23 instead of copying rows
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s16, [x2]
+        QPEL_FILTER_B     v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s18, [x2]
+        QPEL_FILTER_B     v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s20, [x2]
+        QPEL_FILTER_B     v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s22, [x2]
+        QPEL_FILTER_B     v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.ne            1b                      // history registers are back in the v16.. order, wrap around
+2:
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_8                   // weight, shift, offset and store 8 filtered pixels held in v26
+        smull           v24.4s, v26.4h, v30.4h          // widen both halves: filtered * wx
+        smull2          v25.4s, v26.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s          // rounding right shift by (6 + denom)
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s          // + ox, saturating
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s                  // saturating narrow s32 -> s16
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h                  // saturating narrow s16 -> u8
+        st1             {v24.d}[0], [x0], x1            // store 8 output pixels
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1        // x0 dst, x1 dststride, x2 src, x3 srcstride, w4 height, w5 denom, w6 wx, w7 ox, [sp+8] my
+        QPEL_UNI_W_V_HEADER
+        ldr             d16, [x2]               // prime the 7-row history v16..v22 (8 bytes per row)
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d22, [x2]
+
+1:      ldr             d23, [x2, x3]           // loop unrolled 8x: rotate v16-v23 instead of copying rows
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.ne            1b                      // history registers are back in the v16.. order, wrap around
+2:
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_16                  // weight, shift, offset and store 16 filtered pixels held in v26 (low 8) and v27 (high 8)
+        smull           v24.4s, v26.4h, v30.4h          // widen all four quarters: filtered * wx
+        smull2          v25.4s, v26.8h, v30.8h
+        smull           v26.4s, v27.4h, v30.4h
+        smull2          v27.4s, v27.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s          // rounding right shift by (6 + denom)
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqrshl          v26.4s, v26.4s, v31.4s
+        sqrshl          v27.4s, v27.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s          // + ox, saturating
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s                  // saturating narrow s32 -> s16
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h                  // saturating narrow s16 -> u8
+        sqxtun2         v24.16b, v26.8h
+        st1             {v24.16b}, [x0], x1             // store 16 output pixels
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1       // x0 dst, x1 dststride, x2 src, x3 srcstride, w4 height, w5 denom, w6 wx, w7 ox, [sp+8] my
+        QPEL_UNI_W_V_HEADER
+        ldr             q16, [x2]               // prime the 7-row history v16..v22 (16 bytes per row)
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+1:      ldr             q23, [x2, x3]           // loop unrolled 8x: rotate v16-v23 instead of copying rows
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23    // low 8 columns
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23    // high 8 columns
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b                      // history registers are back in the v16.. order, wrap around
+2:
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1       // wide variant: processes the block in 16-column strips; [sp+16] = width
+        QPEL_UNI_W_V_HEADER
+        ldur            w13, [sp, #16]          // width (presumably a multiple of 16 for the sizes routed here - TODO confirm)
+        mov             x14, x0                 // save dst, src and height for the next strip
+        mov             x15, x2
+        mov             w11, w4
+
+3:                                              // per-strip entry: same 8x unrolled rotation as the v16 function
+        ldr             q16, [x2]               // prime the 7-row history v16..v22 (16 bytes per row)
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+
+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23    // low 8 columns of the strip
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23    // high 8 columns of the strip
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        subs            w13, w13, #16           // width -= 16: advance to the next 16-column strip
+        add             x14, x14, #16
+        add             x15, x15, #16
+        mov             x0, x14                 // restore dst/src (offset by 16) and height
+        mov             x2, x15
+        mov             w4, w11
+        b.hi            3b
+        ret
+endfunc
Logan.Lyu May 27, 2023, 8:34 a.m. UTC | #8
Hi, Martin,

I have read your new comments. Do you mean that the code of my current
patch should be guarded as follows?

C code:

/if (have_i8mm(cpu_flags)) {//
//}/

/asm code :/

/#if HAVE_I8MM/

/#endif/


My current code base does not have those definitions yet; should I
write my code as if they already exist?


other opinions is under modification..


在 2023/5/26 16:34, Martin Storsjö 写道:
> Hi,
>
> Overall these patches seem mostly ok, but I've got a few minor points 
> to make:
>
> - The usdot instruction requires the i8mm extension (part of 
> armv8.6-a), while udot or sdot would require the dotprod extension 
> (available in armv8.4-a). If you could manage with udot or sdot, these 
> functions would be usable on a wider set of CPUs.
>
> Therefore, the current guards are wrong. Also, I finally got support 
> implemented for optionally using these cpu extensions, even if the 
> baseline of the compile don't include it, by runtime enabling it. See 
> the patchset at 
> https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=9009.
>
> To adapt your patches on top of this, see the two topmost commits at 
> https://github.com/mstorsjo/ffmpeg/commits/archext.
>
> - The indentation is inconsistent; in the first patch, you have some 
> instructions written like this:
>
> +        sqadd   v1.4s, v1.4s, v29.4s
>
> While you later use this style:
>
> +        dup             v1.16b, v28.b[1]
>
> The latter seems to match the style we commonly use; please reformat 
> your code to match that consistently.
>
> With some macro invocations in the first patch, you also seem to have 
> too much indentation in some places. See e.g. this:
>
> +1:      ldr             q23, [x2, x3]
> +        add             x2, x2, x3, lsl #1
> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
>
> (If the macro name is too long, that's ok, but here there's no need to 
> have those lines unaligned.)
>
> - In the third patch, you've got multiple parameters from the stack 
> like this:
>
> +        ldp             x14, x15, [sp]          // mx, my
> +        ldr             w13, [sp, #16]          // width
>
> I see that the mx an my parameters are intptr_t; that's good, since if 
> they would be 32 bit integers, the ABI for such parameters on the 
> stack differ between macOS/Darwin and Linux. But as long as they're 
> intptr_t they behave the same.
>
> - At the same place, you're backing up a bunch of registers:
>
> +        stp             x20, x21, [sp, #-16]!
> +        stp             x22, x23, [sp, #-16]!
> +        stp             x24, x25, [sp, #-16]!
> +        stp             x26, x27, [sp, #-16]!
> +        stp             x28, x30, [sp, #-16]!
>
> This is inefficient; instead, do this:
>
> +        stp             x28, x30, [sp, #-80]!
> +        stp             x20, x21, [sp, #16]
> +        stp             x22, x23, [sp, #32]
> +        stp             x24, x25, [sp, #48]
> +        stp             x26, x27, [sp, #64]
>
> Also, following that, I see that you back up the stack pointer in x28. 
> Why do you use x28 for that? Using x29 would be customary as frame 
> pointer.
>
> Aside for that, I think the rest of the patches is acceptable.
>
> // Martin
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Martin Storsjö May 27, 2023, 8:24 p.m. UTC | #9
Hi,

On Sat, 27 May 2023, myais wrote:

> I saw your new opinions. Do you mean that the code of my current patch 
> should be guard as follows?
>
> C code:
>
> /if (have_i8mm(cpu_flags)) {//
> //}/
>
> /asm code :/
>
> /#if HAVE_I8MM/
>
> /#endif/

Yes

> I mean my current code base does not have those definitions, should I 
> implement them directly as if they already exist?

I suggest you apply my patches from the patchset I pointed you to, 
https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=9009 - or from 
the start of the branch on github I pointed you to. It's likely that the 
patches will change a little before they're accepted, but I believe the 
main interface towards your code, "#if HAVE_I8MM" and "if 
(have_i8mm(cpu_flags))" will remain as such.

// Martin
Logan.Lyu May 28, 2023, 3:23 a.m. UTC | #10
Hi, Martin

I have finished the modification, please review again.

Thanks.


在 2023/5/26 16:34, Martin Storsjö 写道:
> Hi,
>
> Overall these patches seem mostly ok, but I've got a few minor points 
> to make:
>
> - The usdot instruction requires the i8mm extension (part of 
> armv8.6-a), while udot or sdot would require the dotprod extension 
> (available in armv8.4-a). If you could manage with udot or sdot, these 
> functions would be usable on a wider set of CPUs.
>
> Therefore, the current guards are wrong. Also, I finally got support 
> implemented for optionally using these cpu extensions, even if the 
> baseline of the compile don't include it, by runtime enabling it. See 
> the patchset at 
> https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=9009.
>
> To adapt your patches on top of this, see the two topmost commits at 
> https://github.com/mstorsjo/ffmpeg/commits/archext.
Fixed.
> - The indentation is inconsistent; in the first patch, you have some 
> instructions written like this:
>
> +        sqadd   v1.4s, v1.4s, v29.4s
>
> While you later use this style:
>
> +        dup             v1.16b, v28.b[1]
>
> The latter seems to match the style we commonly use; please reformat 
> your code to match that consistently.
>
> With some macro invocations in the first patch, you also seem to have 
> too much indentation in some places. See e.g. this:
>
> +1:      ldr             q23, [x2, x3]
> +        add             x2, x2, x3, lsl #1
> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
> +        QPEL_UNI_W_V_16
> +        subs            w4, w4, #1
> +        b.eq            2f
>
> (If the macro name is too long, that's ok, but here there's no need to 
> have those lines unaligned.)
Fixed.
>
> - In the third patch, you've got multiple parameters from the stack 
> like this:
>
> +        ldp             x14, x15, [sp]          // mx, my
> +        ldr             w13, [sp, #16]          // width
>
> I see that the mx an my parameters are intptr_t; that's good, since if 
> they would be 32 bit integers, the ABI for such parameters on the 
> stack differ between macOS/Darwin and Linux. But as long as they're 
> intptr_t they behave the same.
>
> - At the same place, you're backing up a bunch of registers:
>
> +        stp             x20, x21, [sp, #-16]!
> +        stp             x22, x23, [sp, #-16]!
> +        stp             x24, x25, [sp, #-16]!
> +        stp             x26, x27, [sp, #-16]!
> +        stp             x28, x30, [sp, #-16]!
>
> This is inefficient; instead, do this:
>
> +        stp             x28, x30, [sp, #-80]!
> +        stp             x20, x21, [sp, #16]
> +        stp             x22, x23, [sp, #32]
> +        stp             x24, x25, [sp, #48]
> +        stp             x26, x27, [sp, #64]
>
> Also, following that, I see that you back up the stack pointer in x28. 
> Why do you use x28 for that? Using x29 would be customary as frame 
> pointer.
I am using the more efficient implementation now. In this case x28 is
an ordinary callee-saved register, but I use x19 instead now anyway.
>
> Aside for that, I think the rest of the patches is acceptable.
>
> // Martin
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
From 224e1b907b9273f6fecaef007730bd1168493515 Mon Sep 17 00:00:00 2001
From: myais <Logan.Lyu@myais.com.cn>
Date: Fri, 5 May 2023 22:06:22 +0800
Subject: [PATCH 2/3] lavc/aarch64: new optimization for 8-bit
 hevc_qpel_uni_w_h

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  15 +-
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 434 ++++++++++++++++++++++
 2 files changed, 448 insertions(+), 1 deletion(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 6b5341dd45..a7e62c7d15 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -145,6 +145,7 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
     void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
     void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
 
+
 NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -155,6 +156,12 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
+
+
 #define NEON8_FNASSIGN(member, v, h, fn, ext) \
         member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
         member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
@@ -174,9 +181,11 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
         member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
 
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
-    if (!have_neon(av_get_cpu_flags())) return;
+    int cpu_flags = av_get_cpu_flags();
+    if (!have_neon(cpu_flags)) return;
 
     if (bit_depth == 8) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_8_neon;
@@ -236,6 +245,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
+        if (have_i8mm(cpu_flags)) {
+            NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+        }
+
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 51df52e1ea..8e8b88c9ea 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -1192,3 +1192,437 @@ function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
         b.hi            3b
         ret
 endfunc
+
+#if HAVE_I8MM
+.macro QPEL_UNI_W_H_HEADER              // common setup for the horizontal uni_w qpel (i8mm) functions
+        ldr             x12, [sp]               // mx (horizontal qpel phase, 9th arg on the stack)
+        sub             x2, x2, #3              // src -= 3: the 8-tap window starts 3 pixels to the left
+        movrel          x9, qpel_filters        // signed 8-tap coefficient table, used directly by usdot
+        add             x9, x9, x12, lsl #3     // 8 coefficient bytes per phase
+        ldr             x11, [x9]
+        dup             v28.2d, x11             // v28 = filter replicated into both 64-bit lanes
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(6 + denom): sqrshl by a negative count = rounding right shift
+        dup             v30.4s, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1   // x0 dst, x1 dststride, x2 src, x3 srcstride, w4 height, w5 denom, w6 wx, w7 ox, [sp] mx
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v0.16b}, [x2], x3
+        ext             v1.16b, v0.16b, v0.16b, #1      // sliding 8-tap windows for output pixels 1..3
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        zip1            v0.2d, v0.2d, v1.2d             // pack two 8-byte windows per register for usdot
+        zip1            v2.2d, v2.2d, v3.2d
+        movi            v16.2d, #0                      // clear the dot-product accumulators
+        movi            v17.2d, #0
+        usdot           v16.4s, v0.16b, v28.16b         // i8mm: u8 pixels x s8 coeffs, four 4-way dot products
+        usdot           v17.4s, v2.16b, v28.16b
+        addp            v16.4s, v16.4s, v17.4s          // pairwise add -> one 32-bit sum per output pixel
+        mul             v16.4s, v16.4s, v30.4s          // * wx
+        sqrshl          v16.4s, v16.4s, v31.4s          // rounding right shift by (6 + denom)
+        sqadd           v16.4s, v16.4s, v29.4s          // + ox
+        sqxtn           v16.4h, v16.4s                  // saturating narrow to u8
+        sqxtun          v16.8b, v16.8h
+        str             s16, [x0]                       // store 4 output pixels
+        add             x0, x0, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1   // x0 dst, x1 dststride, x2 src, x3 srcstride, w4 height, w5 denom, w6 wx, w7 ox, [sp] mx
+        QPEL_UNI_W_H_HEADER
+        sub             x1, x1, #4              // dst stride compensates for the 4-byte post-increment store below
+1:
+        ld1             {v0.16b}, [x2], x3
+        ext             v1.16b, v0.16b, v0.16b, #1      // sliding 8-tap windows for output pixels 1..5
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        ext             v4.16b, v0.16b, v0.16b, #4
+        ext             v5.16b, v0.16b, v0.16b, #5
+        zip1            v0.2d, v0.2d, v1.2d             // pack two 8-byte windows per register for usdot
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        movi            v16.2d, #0                      // clear the dot-product accumulators
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v0.16b, v28.16b         // i8mm: u8 pixels x s8 coeffs
+        usdot           v17.4s, v2.16b, v28.16b
+        usdot           v18.4s, v4.16b, v28.16b
+        addp            v16.4s, v16.4s, v17.4s          // reduce to one 32-bit sum per output pixel
+        addp            v18.4s, v18.4s, v18.4s
+        mul             v16.4s, v16.4s, v30.4s          // * wx
+        mul             v18.2s, v18.2s, v30.2s
+        sqrshl          v16.4s, v16.4s, v31.4s          // rounding right shift by (6 + denom)
+        sqrshl          v18.2s, v18.2s, v31.2s
+        sqadd           v16.4s, v16.4s, v29.4s          // + ox
+        sqadd           v18.2s, v18.2s, v29.2s
+        sqxtn           v16.4h, v16.4s                  // saturating narrow to u8
+        sqxtn2          v16.8h, v18.4s
+        sqxtun          v16.8b, v16.8h
+        str             s16, [x0], #4                   // store 4 + 2 output pixels
+        st1             {v16.h}[2], [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+.macro  QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3        // filter + weight + shift + offset for 8 output pixels; results in d0 (low 4) and d2 (high 4)
+        movi            \d0\().2d, #0                   // clear the dot-product accumulators
+        movi            \d1\().2d, #0
+        movi            \d2\().2d, #0
+        movi            \d3\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b  // i8mm: u8 pixels x s8 coeffs
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        usdot           \d2\().4s, \s2\().16b, v28.16b
+        usdot           \d3\().4s, \s3\().16b, v28.16b
+        addp            \d0\().4s, \d0\().4s, \d1\().4s // reduce to one 32-bit sum per output pixel
+        addp            \d2\().4s, \d2\().4s, \d3\().4s
+        mul             \d0\().4s, \d0\().4s, v30.4s    // * wx
+        mul             \d2\().4s, \d2\().4s, v30.4s
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s    // rounding right shift by (6 + denom)
+        sqrshl          \d2\().4s, \d2\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s    // + ox
+        sqadd           \d2\().4s, \d2\().4s, v29.4s
+.endm
+
+.macro  QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1           // same as QPEL_UNI_W_H_CALC but for only 4 output pixels (result in d0)
+        movi            \d0\().2d, #0                   // clear the dot-product accumulators
+        movi            \d1\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b  // i8mm: u8 pixels x s8 coeffs
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        addp            \d0\().4s, \d0\().4s, \d1\().4s // reduce to one 32-bit sum per output pixel
+        mul             \d0\().4s, \d0\().4s, v30.4s    // * wx
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s    // rounding right shift by (6 + denom)
+        sqadd           \d0\().4s, \d0\().4s, v29.4s    // + ox
+.endm
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_i8mm, export=1   // x0 dst, x1 dststride, x2 src, x3 srcstride, w4 height, w5 denom, w6 wx, w7 ox, [sp] mx
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3    // 32 bytes cover all eight 8-tap windows
+        ext             v1.16b, v16.16b, v17.16b, #1    // sliding windows for output pixels 1..7
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        zip1            v0.2d, v16.2d, v1.2d            // pack two windows per register for usdot
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_UNI_W_H_CALC  v0, v2, v4, v6,  v18, v19, v20, v21
+        sqxtn           v18.4h, v18.4s                  // saturating narrow to u8
+        sqxtn2          v18.8h, v20.4s
+        sqxtun          v18.8b, v18.8h
+        str             d18, [x0]                       // store 8 output pixels
+        add             x0, x0, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_i8mm, export=1  // x0 dst, x1 dststride, x2 src, x3 srcstride, w4 height, w5 denom, w6 wx, w7 ox, [sp] mx
+        QPEL_UNI_W_H_HEADER
+        add             x13, x0, #8             // second store pointer for output pixels 8-11
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1    // sliding 8-tap windows
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        zip1            v18.2d, v16.2d, v1.2d           // low halves: windows for pixels 0-7
+        zip1            v19.2d, v2.2d, v3.2d
+        zip1            v20.2d, v4.2d, v5.2d
+        zip1            v21.2d, v6.2d, v7.2d
+        zip2            v22.2d, v16.2d, v1.2d           // high halves: windows for pixels 8-11
+        zip2            v23.2d, v2.2d, v3.2d
+        QPEL_UNI_W_H_CALC  v18, v19, v20, v21, v0, v2, v4, v6
+        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s                    // saturating narrow to u8
+        sqxtn2          v0.8h, v4.4s
+        sqxtn           v1.4h, v24.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+
+        str             d0, [x0]                        // store 8 + 4 output pixels
+        str             s1, [x13]
+        add             x0, x0, x1
+        add             x13, x13, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_i8mm, export=1  // x0 dst, x1 dststride, x2 src, x3 srcstride, w4 height, w5 denom, w6 wx, w7 ox, [sp] mx
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1    // sliding 8-tap windows; each q-reg already holds two windows
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21   // v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25    // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
+        sqxtn           v0.4h, v18.4s                   // narrow, then trn1/trn2 restore pixel order from the even/odd interleave above
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v1.4h, v20.4s
+        sqxtn2          v1.8h, v24.4s
+        trn1            v2.8h, v0.8h, v1.8h
+        trn2            v3.8h, v0.8h, v1.8h
+        sqxtun          v0.8b, v2.8h                    // saturating narrow to u8
+        sqxtun2         v0.16b, v3.8h
+        st1             {v0.16b}, [x0], x1              // store 16 output pixels
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_i8mm, export=1
+// 8-bit HEVC qpel: horizontal 8-tap filter with uni-directional weighting,
+// 24 pixels per row: 16 pixels as in the h16 case, then 8 more taken
+// entirely from the second source register v17.
+        QPEL_UNI_W_H_HEADER
+        sub             x1, x1, #16             // stride minus the 16 bytes stored with post-increment
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v18.8h, v19.8h  // re-interleave even/odd output lanes
+        trn2            v21.8h, v18.8h, v19.8h
+        sqxtun          v26.8b, v20.8h
+        sqxtun2         v26.16b, v21.8h                         // 0-15
+        ext             v1.16b, v17.16b, v17.16b, #1   // rotations of v17; only low halves are consumed
+        ext             v2.16b, v17.16b, v17.16b, #2
+        ext             v3.16b, v17.16b, v17.16b, #3
+        ext             v4.16b, v17.16b, v17.16b, #4
+        ext             v5.16b, v17.16b, v17.16b, #5
+        ext             v6.16b, v17.16b, v17.16b, #6
+        ext             v7.16b, v17.16b, v17.16b, #7
+        zip1            v0.2d, v17.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_UNI_W_H_CALC  v0, v2, v4, v6, v18, v19, v20, v21
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v20.4s
+        sqxtun          v27.8b, v18.8h          // pixels 16-23
+
+        st1             {v26.16b}, [x0], #16
+        st1             {v27.8b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_i8mm, export=1
+// 8-bit HEVC qpel: horizontal 8-tap filter with uni-directional weighting,
+// 32 pixels per row, processed as two 16-pixel groups (v16->v26, v17->v27).
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v0, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v0.8h, v19.8h   // re-interleave even/odd output lanes
+        trn2            v21.8h, v0.8h, v19.8h
+        sqxtun          v26.8b, v20.8h
+        sqxtun2         v26.16b, v21.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v0, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v0.8h, v19.8h
+        trn2            v21.8h, v0.8h, v19.8h
+        sqxtun          v27.8b, v20.8h
+        sqxtun2         v27.16b, v21.8h                         // 16-31
+        st1             {v26.16b, v27.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_i8mm, export=1
+// 8-bit HEVC qpel: horizontal 8-tap filter with uni-directional weighting,
+// 48 pixels per row, processed as three 16-pixel groups into v25/v26/v27.
+// v24 and v0 serve as scratch result registers inside the calc macro.
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h  // re-interleave even/odd output lanes
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v25.8b, v22.8h
+        sqxtun2         v25.16b, v23.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v26.8b, v22.8h
+        sqxtun2         v26.16b, v23.8h                         // 16-31
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v27.8b, v22.8h
+        sqxtun2         v27.16b, v23.8h                         // 32-47
+        st1             {v25.16b, v26.16b, v27.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_i8mm, export=1
+// 8-bit HEVC qpel: horizontal 8-tap filter with uni-directional weighting,
+// 64 pixels per row, processed as four 16-pixel groups into v16-v19.
+// The first load advances src by 64 bytes, so the stride is pre-reduced
+// by 64 and applied with the extra 16-byte load below.
+        QPEL_UNI_W_H_HEADER
+        sub             x3, x3, #64
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h  // re-interleave even/odd output lanes
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v16.8b, v22.8h
+        sqxtun2         v16.16b, v23.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v17.8b, v22.8h
+        sqxtun2         v17.16b, v23.8h                         // 16-31
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        ld1             {v0.16b}, [x2], x3      // bytes 64+ for the right-edge taps; x3 (stride-64) moves to the next row
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v18.8b, v22.8h
+        sqxtun2         v18.16b, v23.8h                         // 32-47
+        ext             v1.16b, v19.16b, v0.16b, #1
+        ext             v2.16b, v19.16b, v0.16b, #2
+        ext             v3.16b, v19.16b, v0.16b, #3
+        ext             v4.16b, v19.16b, v0.16b, #4
+        ext             v5.16b, v19.16b, v0.16b, #5
+        ext             v6.16b, v19.16b, v0.16b, #6
+        ext             v7.16b, v19.16b, v0.16b, #7
+        QPEL_UNI_W_H_CALC  v19, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v19.8b, v22.8h
+        sqxtun2         v19.16b, v23.8h                         // 48-63
+
+        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+#endif // HAVE_I8MM
Logan.Lyu May 28, 2023, 6:26 a.m. UTC | #11
在 2023/5/28 12:36, Jean-Baptiste Kempf 写道:
> Hello,
>
> The last interaction still has the wrong name in patchset.
Thanks for the reminder. I have corrected the name in git.
>
> jb
>
> On Sun, 28 May 2023, at 12:23, Logan.Lyu wrote:
>> Hi, Martin
>>
>> I have finished the modifications; please review again.
>>
>> Thanks.
>>
>>
>> 在 2023/5/26 16:34, Martin Storsjö 写道:
>>> Hi,
>>>
>>> Overall these patches seem mostly ok, but I've got a few minor points
>>> to make:
>>>
>>> - The usdot instruction requires the i8mm extension (part of
>>> armv8.6-a), while udot or sdot would require the dotprod extension
>>> (available in armv8.4-a). If you could manage with udot or sdot, these
>>> functions would be usable on a wider set of CPUs.
>>>
>>> Therefore, the current guards are wrong. Also, I finally got support
>>> implemented for optionally using these cpu extensions, even if the
>>> baseline of the compile don't include it, by runtime enabling it. See
>>> the patchset at
>>> https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=9009.
>>>
>>> To adapt your patches on top of this, see the two topmost commits at
>>> https://github.com/mstorsjo/ffmpeg/commits/archext.
>> Fixed.
>>> - The indentation is inconsistent; in the first patch, you have some
>>> instructions written like this:
>>>
>>> +        sqadd   v1.4s, v1.4s, v29.4s
>>>
>>> While you later use this style:
>>>
>>> +        dup             v1.16b, v28.b[1]
>>>
>>> The latter seems to match the style we commonly use; please reformat
>>> your code to match that consistently.
>>>
>>> With some macro invocations in the first patch, you also seem to have
>>> too much indentation in some places. See e.g. this:
>>>
>>> +1:      ldr             q23, [x2, x3]
>>> +        add             x2, x2, x3, lsl #1
>>> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
>>> +        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
>>> +        QPEL_UNI_W_V_16
>>> +        subs            w4, w4, #1
>>> +        b.eq            2f
>>>
>>> (If the macro name is too long, that's ok, but here there's no need to
>>> have those lines unaligned.)
>> Fixed.
>>> - In the third patch, you've got multiple parameters from the stack
>>> like this:
>>>
>>> +        ldp             x14, x15, [sp]          // mx, my
>>> +        ldr             w13, [sp, #16]          // width
>>>
>>> I see that the mx an my parameters are intptr_t; that's good, since if
>>> they would be 32 bit integers, the ABI for such parameters on the
>>> stack differ between macOS/Darwin and Linux. But as long as they're
>>> intptr_t they behave the same.
>>>
>>> - At the same place, you're backing up a bunch of registers:
>>>
>>> +        stp             x20, x21, [sp, #-16]!
>>> +        stp             x22, x23, [sp, #-16]!
>>> +        stp             x24, x25, [sp, #-16]!
>>> +        stp             x26, x27, [sp, #-16]!
>>> +        stp             x28, x30, [sp, #-16]!
>>>
>>> This is inefficient; instead, do this:
>>>
>>> +        stp             x28, x30, [sp, #-80]!
>>> +        stp             x20, x21, [sp, #16]
>>> +        stp             x22, x23, [sp, #32]
>>> +        stp             x24, x25, [sp, #48]
>>> +        stp             x26, x27, [sp, #64]
>>>
>>> Also, following that, I see that you back up the stack pointer in x28.
>>> Why do you use x28 for that? Using x29 would be customary as frame
>>> pointer.
>> I am using the more efficient implementation now. And x28 in this case is a
>> common callee-saved register. Anyway, I use x19 instead now.
>>> Aside for that, I think the rest of the patches is acceptable.
>>>
>>> // Martin
>>>
>>> _______________________________________________
>>> ffmpeg-devel mailing list
>>> ffmpeg-devel@ffmpeg.org
>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>>
>>> To unsubscribe, visit link above, or email
>>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
>> Attachments:
>> * 0002-lavc-aarch64-new-optimization-for-8-bit-hevc_qpel_un.patch
>> * 0003-lavc-aarch64-new-optimization-for-8-bit-hevc_qpel_h-.patch
>> * 0001-lavc-aarch64-new-optimization-for-8-bit-hevc_pel_uni.patch
From 8d5875ab393828b83163b98eb4b35837120f1322 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Wed, 3 May 2023 09:53:07 +0800
Subject: [PATCH 1/3] lavc/aarch64: new optimization for 8-bit
 hevc_pel_uni_w_pixels and qpel_uni_w_v

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  51 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 710 ++++++++++++++++++++++
 2 files changed, 761 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index be1049a2ec..6b5341dd45 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -128,6 +128,52 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
                                          ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                          mx, intptr_t my, int width);
 
+/* Declare the 8-bit NEON implementation of ##fn for every HEVC block-width
+ * variant (4, 6, 8, 12, 16, 24, 32, 48, 64); "ext" appends an optional
+ * ISA-extension suffix to the symbol name. */
+#define NEON8_FNPROTO(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+/* As NEON8_FNPROTO, but only the widths 4, 8, 16 and 64 are implemented. */
+#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+/* Weighted pixel copy (no interpolation filter), all widths. */
+NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+/* Vertical 8-tap qpel filter with uni-directional weighting. */
+NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+/* Fill the function-pointer table "member" for the (v, h) filter variant:
+ * table index 1..9 corresponds to block widths 4..64. */
+#define NEON8_FNASSIGN(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
+/* Partial table fill for functions implemented only at widths 4/8/16/64:
+ * the width-32 and width-48 slots ([7], [8]) reuse the 64-wide function,
+ * which presumably iterates on the "width" argument -- TODO confirm against
+ * the asm; the width-6/12/24 slots are left to the existing handlers. */
+#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
     if (!have_neon(av_get_cpu_flags())) return;
@@ -185,6 +231,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[7][0][1]   =
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 0e7b912678..51df52e1ea 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -30,6 +30,13 @@ const qpel_filters, align=4
         .byte           0,  1, -5, 17, 58,-10, 4, -1
 endconst
 
+const qpel_filters_abs, align=4
+// Absolute values of the HEVC luma qpel filter taps (cf. qpel_filters
+// above, which holds them signed); one 8-tap row per fractional position.
+// Signs are re-applied by the code that uses this table.
+        .byte           0,  0,  0,  0,  0,  0, 0,  0
+        .byte           1,  4, 10, 58, 17,  5, 1,  0
+        .byte           1,  4, 11, 40, 40, 11, 4,  1
+        .byte           0,  1,  5, 17, 58, 10, 4,  1
+endconst
+
 .macro load_filter m
         movrel          x15, qpel_filters
         add             x15, x15, \m, lsl #3
@@ -482,3 +489,706 @@ endfunc
 put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
+// Uni-directional weighted pixel copy (no interpolation), 4 pixels wide,
+// two rows per iteration.  Per the C prototype: x0/x1 = dst/dststride,
+// x2/x3 = src/srcstride, w4 = height, w5 = denom, w6 = wx, w7 = ox.
+// out = clip_u8((((src << 6) * wx) >> (denom + 6)) + ox), rounded.
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift (negative -> rounding right shift)
+        dup             v29.4s, w7              // ox
+1:
+        ldr             s0, [x2]
+        ldr             s1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6        // widen to 16 bit, scale by 64
+        ushll           v1.8h, v1.8b, #6
+        smull           v0.4s, v0.4h, v30.4h    // * wx
+        smull           v1.4s, v1.4h, v30.4h
+        sqrshl          v0.4s, v0.4s, v31.4s    // >> (denom + 6), rounding, saturating
+        sqrshl          v1.4s, v1.4s, v31.4s
+        sqadd           v0.4s, v0.4s, v29.4s    // + ox
+        sqadd           v1.4s, v1.4s, v29.4s
+        sqxtn           v0.4h, v0.4s
+        sqxtn           v1.4h, v1.4s
+        sqxtun          v0.8b, v0.8h            // narrow and clamp to u8
+        sqxtun          v1.8b, v1.8h
+        str             s0, [x0]
+        str             s1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
+// Uni-directional weighted pixel copy, 6 pixels wide, two rows per
+// iteration.  Same weighting pipeline as the pixels4 variant; each row
+// is stored as 4 bytes + 2 bytes, hence the stride adjustment below.
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+        sub             x1, x1, #4              // stride minus the 4 bytes stored with post-increment
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6        // widen to 16 bit, scale by 64
+        ushll           v1.8h, v1.8b, #6
+        smull           v4.4s, v0.4h, v30.4h    // * wx
+        smull2          v5.4s, v0.8h, v30.8h
+        smull           v6.4s, v1.4h, v30.4h
+        smull2          v7.4s, v1.8h, v30.8h
+        sqrshl          v4.4s, v4.4s, v31.4s    // >> (denom + 6), rounding, saturating
+        sqrshl          v5.4s, v5.4s, v31.4s
+        sqrshl          v6.4s, v6.4s, v31.4s
+        sqrshl          v7.4s, v7.4s, v31.4s
+        sqadd           v4.4s, v4.4s, v29.4s    // + ox
+        sqadd           v5.4s, v5.4s, v29.4s
+        sqadd           v6.4s, v6.4s, v29.4s
+        sqadd           v7.4s, v7.4s, v29.4s
+        sqxtn           v0.4h, v4.4s
+        sqxtn2          v0.8h, v5.4s
+        sqxtn           v1.4h, v6.4s
+        sqxtn2          v1.8h, v7.4s
+        sqxtun          v0.8b, v0.8h            // narrow and clamp to u8
+        sqxtun          v1.8b, v1.8h
+        str             s0, [x0], #4            // pixels 0-3
+        st1             {v0.h}[2], [x0], x1     // pixels 4-5
+        str             s1, [x0], #4
+        st1             {v1.h}[2], [x0], x1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
+// Uni-directional weighted pixel copy, 8 pixels wide, two rows per
+// iteration.  Same weighting pipeline as the pixels4 variant.
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6        // widen to 16 bit, scale by 64
+        ushll           v1.8h, v1.8b, #6
+        smull           v4.4s, v0.4h, v30.4h    // * wx
+        smull2          v5.4s, v0.8h, v30.8h
+        smull           v6.4s, v1.4h, v30.4h
+        smull2          v7.4s, v1.8h, v30.8h
+        sqrshl          v4.4s, v4.4s, v31.4s    // >> (denom + 6), rounding, saturating
+        sqrshl          v5.4s, v5.4s, v31.4s
+        sqrshl          v6.4s, v6.4s, v31.4s
+        sqrshl          v7.4s, v7.4s, v31.4s
+        sqadd           v4.4s, v4.4s, v29.4s    // + ox
+        sqadd           v5.4s, v5.4s, v29.4s
+        sqadd           v6.4s, v6.4s, v29.4s
+        sqadd           v7.4s, v7.4s, v29.4s
+        sqxtn           v0.4h, v4.4s
+        sqxtn2          v0.8h, v5.4s
+        sqxtn           v1.4h, v6.4s
+        sqxtn2          v1.8h, v7.4s
+        sqxtun          v0.8b, v0.8h            // narrow and clamp to u8
+        sqxtun          v1.8b, v1.8h
+        str             d0, [x0]
+        str             d1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
+// Uni-directional weighted pixel copy, 12 pixels wide, two rows per
+// iteration.  Each row is stored as 8 bytes + 4 bytes, hence the stride
+// adjustment below.  Weighting pipeline as in the pixels4 variant.
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+        sub             x1, x1, #8              // stride minus the 8 bytes stored with post-increment
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v4.8h, v0.8b, #6        // widen to 16 bit, scale by 64
+        ushll2          v5.8h, v0.16b, #6
+        ushll           v6.8h, v1.8b, #6
+        ushll2          v7.8h, v1.16b, #6
+        smull           v16.4s, v4.4h, v30.4h   // * wx
+        smull2          v17.4s, v4.8h, v30.8h
+        smull           v18.4s, v5.4h, v30.4h
+        smull2          v19.4s, v5.8h, v30.8h
+        smull           v20.4s, v6.4h, v30.4h
+        smull2          v21.4s, v6.8h, v30.8h
+        smull           v22.4s, v7.4h, v30.4h
+        smull2          v23.4s, v7.8h, v30.8h
+
+        sqrshl          v16.4s, v16.4s, v31.4s  // >> (denom + 6), rounding, saturating
+        sqrshl          v17.4s, v17.4s, v31.4s
+        sqrshl          v18.4s, v18.4s, v31.4s
+        sqrshl          v19.4s, v19.4s, v31.4s
+        sqrshl          v20.4s, v20.4s, v31.4s
+        sqrshl          v21.4s, v21.4s, v31.4s
+        sqrshl          v22.4s, v22.4s, v31.4s
+        sqrshl          v23.4s, v23.4s, v31.4s
+        sqadd           v16.4s, v16.4s, v29.4s  // + ox
+        sqadd           v17.4s, v17.4s, v29.4s
+        sqadd           v18.4s, v18.4s, v29.4s
+        sqadd           v19.4s, v19.4s, v29.4s
+        sqadd           v20.4s, v20.4s, v29.4s
+        sqadd           v21.4s, v21.4s, v29.4s
+        sqadd           v22.4s, v22.4s, v29.4s
+        sqadd           v23.4s, v23.4s, v29.4s
+        sqxtn           v0.4h, v16.4s
+        sqxtn2          v0.8h, v17.4s
+        sqxtn           v1.4h, v18.4s
+        sqxtn2          v1.8h, v19.4s
+        sqxtn           v2.4h, v20.4s
+        sqxtn2          v2.8h, v21.4s
+        sqxtn           v3.4h, v22.4s
+        sqxtn2          v3.8h, v23.4s
+        sqxtun          v0.8b, v0.8h            // narrow and clamp to u8
+        sqxtun2         v0.16b, v1.8h
+        sqxtun          v2.8b, v2.8h
+        sqxtun2         v2.16b, v3.8h
+        str             d0, [x0], #8            // pixels 0-7
+        st1             {v0.s}[2], [x0], x1     // pixels 8-11
+        str             d2, [x0], #8
+        st1             {v2.s}[2], [x0], x1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3
+// In-place uni-weighting of 16 pixels:
+//   \s0 = clip_u8(rnd_shr((\s0 << 6) * v30[wx], denom + 6) + v29[ox])
+// where v31 holds the negative shift -(denom + 6).  \t0/\t1 are 16-bit
+// temporaries, \d0-\d3 are 32-bit temporaries; \s0 is both input and output.
+        ushll           \t0\().8h, \s0\().8b, #6
+        ushll2          \t1\().8h, \s0\().16b, #6
+        smull           \d0\().4s, \t0\().4h, v30.4h
+        smull2          \d1\().4s, \t0\().8h, v30.8h
+        smull           \d2\().4s, \t1\().4h, v30.4h
+        smull2          \d3\().4s, \t1\().8h, v30.8h
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s
+        sqrshl          \d1\().4s, \d1\().4s, v31.4s
+        sqrshl          \d2\().4s, \d2\().4s, v31.4s
+        sqrshl          \d3\().4s, \d3\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s
+        sqadd           \d1\().4s, \d1\().4s, v29.4s
+        sqadd           \d2\().4s, \d2\().4s, v29.4s
+        sqadd           \d3\().4s, \d3\().4s, v29.4s
+        sqxtn           \t0\().4h, \d0\().4s
+        sqxtn2          \t0\().8h, \d1\().4s
+        sqxtn           \t1\().4h, \d2\().4s
+        sqxtn2          \t1\().8h, \d3\().4s
+        sqxtun          \s0\().8b,  \t0\().8h
+        sqxtun2         \s0\().16b, \t1\().8h
+.endm
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
+// Uni-directional weighted pixel copy, 16 pixels wide, two rows per
+// iteration; the per-pixel arithmetic lives in PEL_UNI_W_PIXEL_CALC.
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        str             q0, [x0]
+        str             q1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
+// Uni-directional weighted pixel copy, 24 pixels wide, one row per
+// iteration (16 pixels from v0 plus 8 from v1); same weighting pipeline
+// as the pixels4 variant, written out as three 8-byte stores.
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3
+        ushll           v4.8h, v0.8b, #6        // widen to 16 bit, scale by 64
+        ushll2          v5.8h, v0.16b, #6
+        ushll           v6.8h, v1.8b, #6
+        smull           v16.4s, v4.4h, v30.4h   // * wx
+        smull2          v17.4s, v4.8h, v30.8h
+        smull           v18.4s, v5.4h, v30.4h
+        smull2          v19.4s, v5.8h, v30.8h
+        smull           v20.4s, v6.4h, v30.4h
+        smull2          v21.4s, v6.8h, v30.8h
+        sqrshl          v16.4s, v16.4s, v31.4s  // >> (denom + 6), rounding, saturating
+        sqrshl          v17.4s, v17.4s, v31.4s
+        sqrshl          v18.4s, v18.4s, v31.4s
+        sqrshl          v19.4s, v19.4s, v31.4s
+        sqrshl          v20.4s, v20.4s, v31.4s
+        sqrshl          v21.4s, v21.4s, v31.4s
+        sqadd           v16.4s, v16.4s, v29.4s  // + ox
+        sqadd           v17.4s, v17.4s, v29.4s
+        sqadd           v18.4s, v18.4s, v29.4s
+        sqadd           v19.4s, v19.4s, v29.4s
+        sqadd           v20.4s, v20.4s, v29.4s
+        sqadd           v21.4s, v21.4s, v29.4s
+        sqxtn           v0.4h, v16.4s
+        sqxtn2          v0.8h, v17.4s
+        sqxtn           v1.4h, v18.4s
+        sqxtn2          v1.8h, v19.4s
+        sqxtn           v2.4h, v20.4s
+        sqxtn2          v2.8h, v21.4s
+        sqxtun          v0.8b, v0.8h            // narrow and clamp to u8
+        sqxtun          v1.8b, v1.8h
+        sqxtun          v2.8b, v2.8h
+        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
+// Uni-directional weighted pixel copy, 32 pixels wide, one row per
+// iteration, as two 16-pixel PEL_UNI_W_PIXEL_CALC groups.
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        st1             {v0.16b, v1.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
+// Uni-directional weighted pixel copy, 48 pixels wide, one row per
+// iteration, as three 16-pixel groups (temporaries v4/v5/v16-v19 reused).
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+1:
+        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Same as pel_uni_w_pixels32 but for width 64: four 16-byte vectors per row.
+// Scratch registers are reused pairwise between the four macro expansions.
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // v30 = wx
+        dup             v31.4s, w10             // v31 = shift
+        dup             v29.4s, w7              // v29 = ox
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs            w4, w4, #1              // height--
+        b.ne            1b
+        ret
+endfunc
+
+// Common setup for the qpel_uni_w_v* functions:
+//  - rewinds src by 3 rows so the 8-tap window is centred on the output row
+//  - loads the 8 tap magnitudes for filter index "my" and splats them to v0-v7
+//  - splats wx into v30, -(denom + 6) into v31 and ox into v29
+.macro QPEL_UNI_W_V_HEADER
+        ldur            x12, [sp, #8]          // my
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3             // src -= 3 * srcstride
+        movrel          x9, qpel_filters_abs
+        add             x9, x9, x12, lsl #3    // 8 filter bytes per index
+        ldr             d28, [x9]
+        dup             v0.16b, v28.b[0]
+        dup             v1.16b, v28.b[1]
+        dup             v2.16b, v28.b[2]
+        dup             v3.16b, v28.b[3]
+        dup             v4.16b, v28.b[4]
+        dup             v5.16b, v28.b[5]
+        dup             v6.16b, v28.b[6]
+        dup             v7.16b, v28.b[7]
+
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
+
+// 8-tap vertical qpel filter over eight source rows (low 8 bytes of each).
+// v0-v7 hold the tap magnitudes (qpel_filters_abs); the sign of each tap is
+// baked into the choice of umlal (add) vs umlsl (subtract).
+// Output: eight signed 16-bit filtered samples in \dst.
+.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+// Same as QPEL_FILTER_B but operating on the high 8 bytes of 16-byte rows
+// (the *2 instruction forms), for the 16-pixel-wide paths.
+.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
+
+// Weight/round/offset four filtered 16-bit samples in v24.4h and store 4
+// output bytes, advancing dst.  Uses v30 = wx, v31 = -(denom + 6), v29 = ox
+// (all set up by QPEL_UNI_W_V_HEADER).
+.macro  QPEL_UNI_W_V_4
+        smull           v24.4s, v24.4h, v30.4h  // * wx
+        sqrshl          v24.4s, v24.4s, v31.4s  // rounding >> (denom + 6)
+        sqadd           v24.4s, v24.4s, v29.4s  // + ox (saturating)
+        sqxtn           v24.4h, v24.4s
+        sqxtun          v24.8b, v24.8h          // clip to u8
+        st1             {v24.s}[0], [x0], x1
+.endm
+
+// void ff_hevc_put_hevc_qpel_uni_w_v4_8_neon(uint8_t *dst, ptrdiff_t dststride,
+//         const uint8_t *src, ptrdiff_t srcstride, int height, int denom,
+//         int wx, int ox, intptr_t mx, intptr_t my, int width)
+// Vertical 8-tap qpel filter with weighted prediction, width 4, 8-bit.
+// v16-v23 hold a sliding window of 8 source rows; the loop is unrolled 8x so
+// each stage reloads only the one row register that rotated out of the
+// window.  Height is re-checked after every single output row.
+function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        // prime the window with the first 7 rows (row 8 loads in the loop)
+        ldr             s16, [x2]
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s18, [x2]
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s20, [x2]
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s22, [x2]
+
+1:      ldr             s23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s16, [x2]
+        QPEL_FILTER_B   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s18, [x2]
+        QPEL_FILTER_B   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s20, [x2]
+        QPEL_FILTER_B   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s22, [x2]
+        QPEL_FILTER_B   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// Weight/round/offset eight filtered 16-bit samples in v26.8h and store 8
+// output bytes, advancing dst.  Clobbers v24/v25.  Uses v30 = wx,
+// v31 = -(denom + 6), v29 = ox.
+.macro QPEL_UNI_W_V_8
+        smull           v24.4s, v26.4h, v30.4h  // * wx (low half)
+        smull2          v25.4s, v26.8h, v30.8h  // * wx (high half)
+        sqrshl          v24.4s, v24.4s, v31.4s  // rounding >> (denom + 6)
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s  // + ox (saturating)
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h          // clip to u8
+        st1             {v24.d}[0], [x0], x1
+.endm
+
+// Vertical 8-tap qpel filter with weighted prediction, width 8, 8-bit.
+// Same 8x-unrolled sliding-window structure as the width-4 version, with
+// 8-byte row loads and QPEL_UNI_W_V_8 for the weighting/store.
+function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        // prime the window with the first 7 rows
+        ldr             d16, [x2]
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d22, [x2]
+
+1:      ldr             d23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d16, [x2]
+        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d18, [x2]
+        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d20, [x2]
+        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d22, [x2]
+        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// Weight/round/offset sixteen filtered 16-bit samples (low half in v26.8h,
+// high half in v27.8h) and store 16 output bytes, advancing dst.  Clobbers
+// v24-v27.  Uses v30 = wx, v31 = -(denom + 6), v29 = ox.
+.macro QPEL_UNI_W_V_16
+        smull           v24.4s, v26.4h, v30.4h  // * wx
+        smull2          v25.4s, v26.8h, v30.8h
+        smull           v26.4s, v27.4h, v30.4h
+        smull2          v27.4s, v27.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s  // rounding >> (denom + 6)
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqrshl          v26.4s, v26.4s, v31.4s
+        sqrshl          v27.4s, v27.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s  // + ox (saturating)
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h          // clip to u8
+        sqxtun2         v24.16b, v26.8h
+        st1             {v24.16b}, [x0], x1
+.endm
+
+// Vertical 8-tap qpel filter with weighted prediction, width 16, 8-bit.
+// 16-byte row loads; QPEL_FILTER_B / QPEL_FILTER_B2 produce the low/high
+// halves (v26/v27), weighted and stored by QPEL_UNI_W_V_16.
+function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        // prime the window with the first 7 rows
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// Vertical 8-tap qpel filter with weighted prediction, 8-bit, widths >= 32
+// (also registered for widths 32 and 48 via NEON8_FNASSIGN_PARTIAL_4).
+// Processes the block in 16-pixel-wide column strips: w13 = width (11th
+// argument, from the stack), x14/x15 track the dst/src strip base, and
+// w11 preserves height so each strip restarts with the full row count.
+function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldur            w13, [sp, #16]          // width
+        mov             x14, x0                 // dst strip base
+        mov             x15, x2                 // src strip base
+        mov             w11, w4                 // saved height
+
+3:      // start of one 16-wide column strip
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+
+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:      // strip done: advance 16 columns and restore height
+        subs            w13, w13, #16
+        add             x14, x14, #16
+        add             x15, x15, #16
+        mov             x0, x14
+        mov             x2, x15
+        mov             w4, w11
+        b.hi            3b
+        ret
+endfunc
Martin Storsjö June 1, 2023, 11:23 a.m. UTC | #12
On Sun, 28 May 2023, Logan.Lyu wrote:

>
> 在 2023/5/28 12:36, Jean-Baptiste Kempf 写道:
>> Hello,
>> 
>> The last interaction still has the wrong name in patchset.
> Thanks for reminding.  I modified the correct name in git.

Thanks, most of the issues in the patch seem to have been fixed - however 
there's one big breakage here. Also even if this is accepted, we'll have 
to wait for the dependency patches to be merged before these can go in 
though.

For restoring the saved registers on the stack, you currently have this:

         ldp             x19, x30, [sp]
         ldp             x26, x27, [sp, #16]
         ldp             x24, x25, [sp, #32]
         ldp             x22, x23, [sp, #48]
         ldp             x20, x21, [sp, #64]
         add             sp, sp, #80

You can avoid the extra add at the end by reordering them like this:

         ldp             x26, x27, [sp, #16]
         ldp             x24, x25, [sp, #32]
         ldp             x22, x23, [sp, #48]
         ldp             x20, x21, [sp, #64]
         ldp             x19, x30, [sp], #80

But the order/layout of the registers doesn't match how they are backed 
up. So when you run checkasm, you'll get these errors:

I8MM:
  - hevc_pel.qpel                   [OK]
    put_hevc_qpel_uni_w_hv4_8_i8mm (failed to preserve register)
    put_hevc_qpel_uni_w_hv8_8_i8mm (failed to preserve register)
    put_hevc_qpel_uni_w_hv16_8_i8mm (failed to preserve register)
    put_hevc_qpel_uni_w_hv32_8_i8mm (failed to preserve register)
    put_hevc_qpel_uni_w_hv64_8_i8mm (failed to preserve register)
  - hevc_pel.qpel_uni_w             [FAILED]
checkasm: 5 of 1136 tests have failed

It's easiest to make the epilogue a mirror copy of the prologue.

Please rerun checkasm on a system that supports i8mm when posting
updated patches.

// Martin
Logan.Lyu June 2, 2023, 12:47 p.m. UTC | #13
Hi, Martin,

I'm sorry, I made a stupid mistake, and it's fixed now.

If these patches are acceptable to you, I will submit some similar 
patches soon.

Thanks.


在 2023/6/1 19:23, Martin Storsjö 写道:
> On Sun, 28 May 2023, Logan.Lyu wrote:
>
>>
>> 在 2023/5/28 12:36, Jean-Baptiste Kempf 写道:
>>> Hello,
>>>
>>> The last interaction still has the wrong name in patchset.
>> Thanks for reminding.  I modified the correct name in git.
>
> Thanks, most of the issues in the patch seem to have been fixed - 
> however there's one big breakage here. Also even if this is accepted, 
> we'll have to wait for the dependency patches to be merged before 
> these can go in though.
>
> For restoring the saved registers on the stack, you currently have this:
>
>         ldp             x19, x30, [sp]
>         ldp             x26, x27, [sp, #16]
>         ldp             x24, x25, [sp, #32]
>         ldp             x22, x23, [sp, #48]
>         ldp             x20, x21, [sp, #64]
>         add             sp, sp, #80
>
> You can avoid the extra add at the end by reordering them like this:
>
>         ldp             x26, x27, [sp, #16]
>         ldp             x24, x25, [sp, #32]
>         ldp             x22, x23, [sp, #48]
>         ldp             x20, x21, [sp, #64]
>         ldp             x19, x30, [sp], #80
>
> But the order/layout of the registers doesn't match how they are 
> backed up. So when you run checkasm, you'll get these errors:
>
> I8MM:
>  - hevc_pel.qpel                   [OK]
>    put_hevc_qpel_uni_w_hv4_8_i8mm (failed to preserve register)
>    put_hevc_qpel_uni_w_hv8_8_i8mm (failed to preserve register)
>    put_hevc_qpel_uni_w_hv16_8_i8mm (failed to preserve register)
>    put_hevc_qpel_uni_w_hv32_8_i8mm (failed to preserve register)
>    put_hevc_qpel_uni_w_hv64_8_i8mm (failed to preserve register)
>  - hevc_pel.qpel_uni_w             [FAILED]
> checkasm: 5 of 1136 tests have failed
>
> It's easiest to make the epilogue a mirror copy of the prologue.
>
> Please rerun checkasm on as system that does support i8mm when posting 
> updated patches.
>
> // Martin
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
From 8d5875ab393828b83163b98eb4b35837120f1322 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Wed, 3 May 2023 09:53:07 +0800
Subject: [PATCH 1/3] lavc/aarch64: new optimization for 8-bit
 hevc_pel_uni_w_pixels and qpel_uni_w_v

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  51 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 710 ++++++++++++++++++++++
 2 files changed, 761 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index be1049a2ec..6b5341dd45 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -128,6 +128,52 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
                                          ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                          mx, intptr_t my, int width);
 
+/* Declare the 8-bit NEON prototypes of ff_hevc_put_hevc_<fn> for all nine
+ * HEVC block widths (4, 6, 8, 12, 16, 24, 32, 48, 64); "ext" is an optional
+ * ISA-extension suffix (empty for plain NEON). */
+#define NEON8_FNPROTO(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+/* As NEON8_FNPROTO, but for functions implemented only for widths
+ * 4, 8, 16 and 64. */
+#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+/* Unidirectional weighted prediction on unfiltered pixels, all widths. */
+NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+/* Vertical qpel filter + weighted prediction; widths 4/8/16/64 only. */
+NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+/* Assign the NEON implementation of <fn> into a put_hevc_* function table;
+ * slots 1..9 correspond to widths 4, 6, 8, 12, 16, 24, 32, 48, 64. */
+#define NEON8_FNASSIGN(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
+/* Assignment variant for functions implemented only for widths 4, 8, 16 and
+ * 64.  Slots 7-9 (widths 32, 48, 64) deliberately all use the 64 routine,
+ * which iterates over the "width" argument in 16-column strips. */
+#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
     if (!have_neon(av_get_cpu_flags())) return;
@@ -185,6 +231,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[7][0][1]   =
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 0e7b912678..51df52e1ea 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -30,6 +30,13 @@ const qpel_filters, align=4
         .byte           0,  1, -5, 17, 58,-10, 4, -1
 endconst
 
+// Absolute values of the signed qpel filter coefficients (one 8-byte row per
+// fractional position); the tap signs are re-applied by the umlal/umlsl
+// choice in QPEL_FILTER_B / QPEL_FILTER_B2.
+const qpel_filters_abs, align=4
+        .byte           0,  0,  0,  0,  0,  0, 0,  0
+        .byte           1,  4, 10, 58, 17,  5, 1,  0
+        .byte           1,  4, 11, 40, 40, 11, 4,  1
+        .byte           0,  1,  5, 17, 58, 10, 4,  1
+endconst
+
 .macro load_filter m
         movrel          x15, qpel_filters
         add             x15, x15, \m, lsl #3
@@ -482,3 +489,706 @@ endfunc
 put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
+
+
+// void ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon(uint8_t *dst, ptrdiff_t dststride,
+//         const uint8_t *src, ptrdiff_t srcstride, int height, int denom,
+//         int wx, int ox, intptr_t mx, intptr_t my, int width)
+// Unidirectional weighted prediction of unfiltered pixels, width 4, 8-bit.
+// Per pixel: dst = clip_u8((((src << 6) * wx) >> (denom + 6)) + ox).
+// Two rows are processed per loop iteration (height decremented by 2).
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // v30 = wx
+        dup             v31.4s, w10             // v31 = shift (negative => right shift)
+        dup             v29.4s, w7              // v29 = ox
+1:
+        ldr             s0, [x2]
+        ldr             s1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6        // src << 6
+        ushll           v1.8h, v1.8b, #6
+        smull           v0.4s, v0.4h, v30.4h    // * wx
+        smull           v1.4s, v1.4h, v30.4h
+        sqrshl          v0.4s, v0.4s, v31.4s    // rounding >> (denom + 6)
+        sqrshl          v1.4s, v1.4s, v31.4s
+        sqadd           v0.4s, v0.4s, v29.4s    // + ox (saturating)
+        sqadd           v1.4s, v1.4s, v29.4s
+        sqxtn           v0.4h, v0.4s
+        sqxtn           v1.4h, v1.4s
+        sqxtun          v0.8b, v0.8h            // clip to u8
+        sqxtun          v1.8b, v1.8h
+        str             s0, [x0]
+        str             s1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2              // height -= 2
+        b.ne            1b
+        ret
+endfunc
+
+// Width-6 variant: 8 pixels are computed per row but only 6 stored, as a
+// 4-byte piece plus a 2-byte piece.  Both stores post-increment dst, so
+// dststride is pre-decremented by the 4 bytes of the first piece.
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // v30 = wx
+        dup             v31.4s, w10             // v31 = shift
+        dup             v29.4s, w7              // v29 = ox
+        sub             x1, x1, #4              // compensate the str ..., #4 below
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6        // src << 6
+        ushll           v1.8h, v1.8b, #6
+        smull           v4.4s, v0.4h, v30.4h    // * wx
+        smull2          v5.4s, v0.8h, v30.8h
+        smull           v6.4s, v1.4h, v30.4h
+        smull2          v7.4s, v1.8h, v30.8h
+        sqrshl          v4.4s, v4.4s, v31.4s    // rounding >> (denom + 6)
+        sqrshl          v5.4s, v5.4s, v31.4s
+        sqrshl          v6.4s, v6.4s, v31.4s
+        sqrshl          v7.4s, v7.4s, v31.4s
+        sqadd           v4.4s, v4.4s, v29.4s    // + ox (saturating)
+        sqadd           v5.4s, v5.4s, v29.4s
+        sqadd           v6.4s, v6.4s, v29.4s
+        sqadd           v7.4s, v7.4s, v29.4s
+        sqxtn           v0.4h, v4.4s
+        sqxtn2          v0.8h, v5.4s
+        sqxtn           v1.4h, v6.4s
+        sqxtn2          v1.8h, v7.4s
+        sqxtun          v0.8b, v0.8h            // clip to u8
+        sqxtun          v1.8b, v1.8h
+        str             s0, [x0], #4            // bytes 0-3
+        st1             {v0.h}[2], [x0], x1     // bytes 4-5
+        str             s1, [x0], #4
+        st1             {v1.h}[2], [x0], x1
+        subs            w4, w4, #2              // height -= 2
+        b.ne            1b
+        ret
+endfunc
+
+// Width-8 variant of pel_uni_w_pixels; two rows per iteration, 8-byte
+// loads and stores.  Same per-pixel formula as the width-4 version.
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // v30 = wx
+        dup             v31.4s, w10             // v31 = shift
+        dup             v29.4s, w7              // v29 = ox
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6        // src << 6
+        ushll           v1.8h, v1.8b, #6
+        smull           v4.4s, v0.4h, v30.4h    // * wx
+        smull2          v5.4s, v0.8h, v30.8h
+        smull           v6.4s, v1.4h, v30.4h
+        smull2          v7.4s, v1.8h, v30.8h
+        sqrshl          v4.4s, v4.4s, v31.4s    // rounding >> (denom + 6)
+        sqrshl          v5.4s, v5.4s, v31.4s
+        sqrshl          v6.4s, v6.4s, v31.4s
+        sqrshl          v7.4s, v7.4s, v31.4s
+        sqadd           v4.4s, v4.4s, v29.4s    // + ox (saturating)
+        sqadd           v5.4s, v5.4s, v29.4s
+        sqadd           v6.4s, v6.4s, v29.4s
+        sqadd           v7.4s, v7.4s, v29.4s
+        sqxtn           v0.4h, v4.4s
+        sqxtn2          v0.8h, v5.4s
+        sqxtn           v1.4h, v6.4s
+        sqxtn2          v1.8h, v7.4s
+        sqxtun          v0.8b, v0.8h            // clip to u8
+        sqxtun          v1.8b, v1.8h
+        str             d0, [x0]
+        str             d1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2              // height -= 2
+        b.ne            1b
+        ret
+endfunc
+
+// Width-12 variant: 16 pixels are loaded and computed per row but only 12
+// stored (an 8-byte piece plus a 4-byte piece); dststride is pre-decremented
+// by the 8 bytes consumed by the first post-incrementing store.
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // v30 = wx
+        dup             v31.4s, w10             // v31 = shift
+        dup             v29.4s, w7              // v29 = ox
+        sub             x1, x1, #8              // compensate the str ..., #8 below
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v4.8h, v0.8b, #6        // src << 6
+        ushll2          v5.8h, v0.16b, #6
+        ushll           v6.8h, v1.8b, #6
+        ushll2          v7.8h, v1.16b, #6
+        smull           v16.4s, v4.4h, v30.4h   // * wx
+        smull2          v17.4s, v4.8h, v30.8h
+        smull           v18.4s, v5.4h, v30.4h
+        smull2          v19.4s, v5.8h, v30.8h
+        smull           v20.4s, v6.4h, v30.4h
+        smull2          v21.4s, v6.8h, v30.8h
+        smull           v22.4s, v7.4h, v30.4h
+        smull2          v23.4s, v7.8h, v30.8h
+
+        sqrshl          v16.4s, v16.4s, v31.4s  // rounding >> (denom + 6)
+        sqrshl          v17.4s, v17.4s, v31.4s
+        sqrshl          v18.4s, v18.4s, v31.4s
+        sqrshl          v19.4s, v19.4s, v31.4s
+        sqrshl          v20.4s, v20.4s, v31.4s
+        sqrshl          v21.4s, v21.4s, v31.4s
+        sqrshl          v22.4s, v22.4s, v31.4s
+        sqrshl          v23.4s, v23.4s, v31.4s
+        sqadd           v16.4s, v16.4s, v29.4s  // + ox (saturating)
+        sqadd           v17.4s, v17.4s, v29.4s
+        sqadd           v18.4s, v18.4s, v29.4s
+        sqadd           v19.4s, v19.4s, v29.4s
+        sqadd           v20.4s, v20.4s, v29.4s
+        sqadd           v21.4s, v21.4s, v29.4s
+        sqadd           v22.4s, v22.4s, v29.4s
+        sqadd           v23.4s, v23.4s, v29.4s
+        sqxtn           v0.4h, v16.4s
+        sqxtn2          v0.8h, v17.4s
+        sqxtn           v1.4h, v18.4s
+        sqxtn2          v1.8h, v19.4s
+        sqxtn           v2.4h, v20.4s
+        sqxtn2          v2.8h, v21.4s
+        sqxtn           v3.4h, v22.4s
+        sqxtn2          v3.8h, v23.4s
+        sqxtun          v0.8b, v0.8h            // clip to u8
+        sqxtun2         v0.16b, v1.8h
+        sqxtun          v2.8b, v2.8h
+        sqxtun2         v2.16b, v3.8h
+        str             d0, [x0], #8            // bytes 0-7
+        st1             {v0.s}[2], [x0], x1     // bytes 8-11
+        str             d2, [x0], #8
+        st1             {v2.s}[2], [x0], x1
+        subs            w4, w4, #2              // height -= 2
+        b.ne            1b
+        ret
+endfunc
+
+// Weighted-prediction core for one 16-byte vector of pixels:
+//   \s0 (in/out): 16 input pixels, replaced by 16 weighted output pixels
+//   \t0/\t1:      16-bit scratch; \d0-\d3: 32-bit scratch
+// Uses v30 = wx, v31 = -(denom + 6), v29 = ox.
+// Computes clip_u8((((pix << 6) * wx) >> (denom + 6)) + ox) per pixel,
+// with rounding shift (sqrshl) and saturation throughout.
+.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3
+        ushll           \t0\().8h, \s0\().8b, #6
+        ushll2          \t1\().8h, \s0\().16b, #6
+        smull           \d0\().4s, \t0\().4h, v30.4h
+        smull2          \d1\().4s, \t0\().8h, v30.8h
+        smull           \d2\().4s, \t1\().4h, v30.4h
+        smull2          \d3\().4s, \t1\().8h, v30.8h
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s
+        sqrshl          \d1\().4s, \d1\().4s, v31.4s
+        sqrshl          \d2\().4s, \d2\().4s, v31.4s
+        sqrshl          \d3\().4s, \d3\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s
+        sqadd           \d1\().4s, \d1\().4s, v29.4s
+        sqadd           \d2\().4s, \d2\().4s, v29.4s
+        sqadd           \d3\().4s, \d3\().4s, v29.4s
+        sqxtn           \t0\().4h, \d0\().4s
+        sqxtn2          \t0\().8h, \d1\().4s
+        sqxtn           \t1\().4h, \d2\().4s
+        sqxtn2          \t1\().8h, \d3\().4s
+        sqxtun          \s0\().8b,  \t0\().8h
+        sqxtun2         \s0\().16b, \t1\().8h
+.endm
+
+
+// Width-16 variant of pel_uni_w_pixels; two rows per iteration, one
+// PEL_UNI_W_PIXEL_CALC per row.
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // v30 = wx
+        dup             v31.4s, w10             // v31 = shift
+        dup             v29.4s, w7              // v29 = ox
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        str             q0, [x0]
+        str             q1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2              // height -= 2
+        b.ne            1b
+        ret
+endfunc
+
+
+
+// Same contract as pixels16 above, but 24 pixels wide, one row per
+// iteration.  The ld1 below fetches 32 bytes and only the first 24 are
+// used (v1 is consumed via its low half only); the 8-byte over-read is
+// presumably covered by the padded HEVC source buffer — TODO confirm.
+// The weighting pipeline is PEL_UNI_W_PIXEL_CALC open-coded for
+// 3 x 8 pixels instead of 2 x 16.
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift for sqrshl
+        dup             v29.4s, w7              // ox
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3
+        ushll           v4.8h, v0.8b, #6        // pixels 0-7   << 6
+        ushll2          v5.8h, v0.16b, #6       // pixels 8-15  << 6
+        ushll           v6.8h, v1.8b, #6        // pixels 16-23 << 6
+        smull           v16.4s, v4.4h, v30.4h   // * wx, widened
+        smull2          v17.4s, v4.8h, v30.8h
+        smull           v18.4s, v5.4h, v30.4h
+        smull2          v19.4s, v5.8h, v30.8h
+        smull           v20.4s, v6.4h, v30.4h
+        smull2          v21.4s, v6.8h, v30.8h
+        sqrshl          v16.4s, v16.4s, v31.4s  // >> (denom + 6), rounding
+        sqrshl          v17.4s, v17.4s, v31.4s
+        sqrshl          v18.4s, v18.4s, v31.4s
+        sqrshl          v19.4s, v19.4s, v31.4s
+        sqrshl          v20.4s, v20.4s, v31.4s
+        sqrshl          v21.4s, v21.4s, v31.4s
+        sqadd           v16.4s, v16.4s, v29.4s  // + ox, saturating
+        sqadd           v17.4s, v17.4s, v29.4s
+        sqadd           v18.4s, v18.4s, v29.4s
+        sqadd           v19.4s, v19.4s, v29.4s
+        sqadd           v20.4s, v20.4s, v29.4s
+        sqadd           v21.4s, v21.4s, v29.4s
+        sqxtn           v0.4h, v16.4s           // narrow to 16 bit
+        sqxtn2          v0.8h, v17.4s
+        sqxtn           v1.4h, v18.4s
+        sqxtn2          v1.8h, v19.4s
+        sqxtn           v2.4h, v20.4s
+        sqxtn2          v2.8h, v21.4s
+        sqxtun          v0.8b, v0.8h            // narrow to u8, saturating
+        sqxtun          v1.8b, v1.8h
+        sqxtun          v2.8b, v2.8h
+        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1   // 24 output bytes
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Same contract as pixels16 above, but 32 pixels wide, one row per
+// iteration (2 x 16-pixel PEL_UNI_W_PIXEL_CALC).
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift for sqrshl
+        dup             v29.4s, w7              // ox
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        st1             {v0.16b, v1.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+
+// Same contract as pixels16 above, but 48 pixels wide, one row per
+// iteration.  The third PEL_UNI_W_PIXEL_CALC reuses the scratch
+// registers of the first; that is safe because v0's result is final
+// before v2 is processed.
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift for sqrshl
+        dup             v29.4s, w7              // ox
+1:
+        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Same contract as pixels16 above, but 64 pixels wide, one row per
+// iteration (4 x 16-pixel PEL_UNI_W_PIXEL_CALC, scratch regs reused
+// pairwise, which is safe since results land in v0-v3 themselves).
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift for sqrshl
+        dup             v29.4s, w7              // ox
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Shared prologue for all qpel_uni_w_v functions.
+// Reads my (9th arg) from the stack, rewinds src by 3 rows (the 8-tap
+// vertical filter needs 3 rows above the current one), loads the 8
+// absolute-valued filter coefficients for phase my and splats each one
+// into v0-v7 (signs are re-applied by QPEL_FILTER_B/QPEL_FILTER_B2),
+// then prepares the weighting constants v30/v31/v29.
+.macro QPEL_UNI_W_V_HEADER
+        ldur            x12, [sp, #8]          // my
+        sub             x2, x2, x3, lsl #1     // src -= 3 * srcstride
+        sub             x2, x2, x3
+        movrel          x9, qpel_filters_abs
+        add             x9, x9, x12, lsl #3    // 8 bytes of |coeff| per phase
+        ldr             d28, [x9]
+        dup             v0.16b, v28.b[0]
+        dup             v1.16b, v28.b[1]
+        dup             v2.16b, v28.b[2]
+        dup             v3.16b, v28.b[3]
+        dup             v4.16b, v28.b[4]
+        dup             v5.16b, v28.b[5]
+        dup             v6.16b, v28.b[6]
+        dup             v7.16b, v28.b[7]
+
+        mov             w10, #-6
+        sub             w10, w10, w5           // w10 = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
+
+// 8-tap vertical qpel filter on the low 8 bytes of \src0-\src7.
+// v0-v7 hold the coefficient magnitudes (qpel_filters_abs); the sign
+// pattern (-, +, -, +, +, -, +, -) is re-applied here through the
+// umlsl/umull/umlal choice, matching the signed qpel_filters table for
+// all three non-zero phases.  Result: \dst, 8 signed 16-bit samples.
+.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull           \dst\().8h, \src1\().8b, v1.8b      // + c1
+        umlsl           \dst\().8h, \src0\().8b, v0.8b      // - c0
+        umlsl           \dst\().8h, \src2\().8b, v2.8b      // - c2
+        umlal           \dst\().8h, \src3\().8b, v3.8b      // + c3
+        umlal           \dst\().8h, \src4\().8b, v4.8b      // + c4
+        umlsl           \dst\().8h, \src5\().8b, v5.8b      // - c5
+        umlal           \dst\().8h, \src6\().8b, v6.8b      // + c6
+        umlsl           \dst\().8h, \src7\().8b, v7.8b      // - c7
+.endm
+
+// Same as QPEL_FILTER_B but operating on the high 8 bytes of each
+// 16-byte source register (the umull2/umlal2/umlsl2 forms).
+.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull2          \dst\().8h, \src1\().16b, v1.16b    // + c1
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b    // - c0
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b    // - c2
+        umlal2          \dst\().8h, \src3\().16b, v3.16b    // + c3
+        umlal2          \dst\().8h, \src4\().16b, v4.16b    // + c4
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b    // - c5
+        umlal2          \dst\().8h, \src6\().16b, v6.16b    // + c6
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b    // - c7
+.endm
+
+// Weight (wx, shift, ox in v30/v31/v29), clip and store 4 filtered
+// samples held in v24.4h; advances dst by one row.
+.macro  QPEL_UNI_W_V_4
+        smull           v24.4s, v24.4h, v30.4h      // * wx
+        sqrshl          v24.4s, v24.4s, v31.4s      // >> (denom + 6), rounding
+        sqadd           v24.4s, v24.4s, v29.4s      // + ox
+        sqxtn           v24.4h, v24.4s
+        sqxtun          v24.8b, v24.8h              // clip to u8
+        st1             {v24.s}[0], [x0], x1
+.endm
+
+// qpel vertical filter + weighted prediction, 4 pixels wide.
+// v16-v23 hold a sliding window of 8 source rows.  The loop is unrolled
+// 8x: each step loads the next row into the slot holding the oldest row
+// and rotates the argument order of QPEL_FILTER_B accordingly, so no
+// register-to-register shuffling is needed.  Loads alternate between
+// [x2] and [x2, x3], advancing x2 by 2*srcstride every other step.
+function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             s16, [x2]              // preload 7 history rows
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s18, [x2]
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s20, [x2]
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s22, [x2]
+
+1:      ldr             s23, [x2, x3]          // newest row -> oldest slot
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s16, [x2]
+        QPEL_FILTER_B   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s18, [x2]
+        QPEL_FILTER_B   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s20, [x2]
+        QPEL_FILTER_B   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s22, [x2]
+        QPEL_FILTER_B   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// Weight, clip and store 8 filtered samples held in v26.8h
+// (v24/v25 are scratch); advances dst by one row.
+.macro QPEL_UNI_W_V_8
+        smull           v24.4s, v26.4h, v30.4h      // * wx, widened
+        smull2          v25.4s, v26.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s      // >> (denom + 6), rounding
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s      // + ox
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h              // clip to u8
+        st1             {v24.d}[0], [x0], x1
+.endm
+
+// qpel vertical filter + weighted prediction, 8 pixels wide.
+// Identical rolling-window structure to the 4-wide version above,
+// with 8-byte (d-register) row loads and the 8-sample store macro.
+function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             d16, [x2]              // preload 7 history rows
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d22, [x2]
+
+1:      ldr             d23, [x2, x3]          // newest row -> oldest slot
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d16, [x2]
+        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d18, [x2]
+        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d20, [x2]
+        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d22, [x2]
+        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// Weight, clip and store 16 filtered samples held in v26.8h (low half)
+// and v27.8h (high half); v24-v27 are all clobbered.  Advances dst by
+// one row.
+.macro QPEL_UNI_W_V_16
+        smull           v24.4s, v26.4h, v30.4h      // * wx, widened
+        smull2          v25.4s, v26.8h, v30.8h
+        smull           v26.4s, v27.4h, v30.4h
+        smull2          v27.4s, v27.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s      // >> (denom + 6), rounding
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqrshl          v26.4s, v26.4s, v31.4s
+        sqrshl          v27.4s, v27.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s      // + ox
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h              // clip to u8
+        sqxtun2         v24.16b, v26.8h
+        st1             {v24.16b}, [x0], x1
+.endm
+
+// qpel vertical filter + weighted prediction, 16 pixels wide.
+// Same rolling-window structure as the 4/8-wide versions; each step
+// filters the low and high byte lanes with QPEL_FILTER_B /
+// QPEL_FILTER_B2 into v26/v27 before weighting and storing.
+function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             q16, [x2]              // preload 7 history rows
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+1:      ldr             q23, [x2, x3]          // newest row -> oldest slot
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+// qpel vertical filter + weighted prediction for wide blocks.
+// Processes the block in vertical stripes of 16 columns: w13 = width
+// (11th arg, from the stack), x14/x15 remember the dst/src column base,
+// w11 the full height.  The inner 8-phase loop is identical to the
+// 16-wide function above.
+// NOTE(review): the stripe loop ("subs w13, #16 / b.hi") rounds the
+// width up to a multiple of 16; NEON8_FNASSIGN_PARTIAL_4 also routes
+// widths such as 24 and 48 here — confirm non-multiple-of-16 widths
+// cannot over-write past the block.
+function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldur            w13, [sp, #16]         // width
+        mov             x14, x0                // column base of dst
+        mov             x15, x2                // column base of src
+        mov             w11, w4                // full height
+
+3:                                             // per-16-column stripe
+        ldr             q16, [x2]              // preload 7 history rows
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+
+1:      ldr             q23, [x2, x3]          // newest row -> oldest slot
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        subs            w13, w13, #16          // next 16-column stripe
+        add             x14, x14, #16
+        add             x15, x15, #16
+        mov             x0, x14
+        mov             x2, x15
+        mov             w4, w11                // reset height
+        b.hi            3b
+        ret
+endfunc
Martin Storsjö June 3, 2023, 8:50 p.m. UTC | #14
On Fri, 2 Jun 2023, Logan.Lyu wrote:

> I'm sorry I made a stupid mistake, And it's fixed now.

Thanks, these look fine to me. I'll push them after the prerequisite 
patches are pushed.

> If these patches are acceptable to you, I will submit some similar patches 
> soon.

Sure, that should be ok now.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index be1049a2ec..42b8e9169d 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -128,6 +128,91 @@  void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t 
*_dst, ptrdiff_t _dststride, co
                                           ptrdiff_t _srcstride, const 
int16_t *src2, int height, intptr_t
                                           mx, intptr_t my, int width);
+/* Declare ff_hevc_put_hevc_<fn><W>_8_neon<ext> for all nine HEVC widths
+ * W = 4, 6, 8, 12, 16, 24, 32, 48, 64. */
  +#define NEON8_FNPROTO(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+/* As NEON8_FNPROTO, but only the four widths (4, 8, 16, 64) for which a
+ * dedicated NEON kernel exists. */
+#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+/* As NEON8_FNPROTO, but only the five widths (4, 8, 16, 32, 64) for
+ * which a dedicated NEON kernel exists. */
+#define NEON8_FNPROTO_PARTIAL_5(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+
+NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _dotprod);
+
+NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _dotprod);
+
+NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _dotprod);
+
+#endif
+
+/* Install a NEON kernel in every width slot of a dsp function table;
+ * index 1..9 corresponds to width 4, 6, 8, 12, 16, 24, 32, 48, 64. */
+#define NEON8_FNASSIGN(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
+/* Fill all nine width slots from only the 4/8/16/64 kernels; widths
+ * without a dedicated kernel (6, 12, 24, 32, 48) fall back to the
+ * next-wider one.
+ * NOTE(review): a fallback kernel is wider than the nominal block
+ * (e.g. width 6 -> 8-wide kernel, width 24 -> 64-wide kernel); confirm
+ * the kernels honour the width argument, or that the extra columns are
+ * safe to write for these call sites. */
+#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
+/* Fill all nine width slots from only the 4/8/16/32/64 kernels; widths
+ * without a dedicated kernel (6, 12, 24, 48) fall back to the
+ * next-wider one.
+ * NOTE(review): same wider-than-nominal concern as
+ * NEON8_FNASSIGN_PARTIAL_4 — verify the fallback widths. */
+#define NEON8_FNASSIGN_PARTIAL_5(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int 
bit_depth)
  {
      if (!have_neon(av_get_cpu_flags())) return;
@@ -185,6 +270,17 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
          c->put_hevc_qpel_bi[7][0][1]   =
          c->put_hevc_qpel_bi[8][0][1]   =
          c->put_hevc_qpel_bi[9][0][1]   = 
ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+
+    #if defined(__ARM_FEATURE_DOTPROD)
+        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _dotprod);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, 
_dotprod);
+        NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, 
qpel_uni_w_hv, _dotprod);
+
+    #endif
      }
      if (bit_depth == 10) {
          c->hevc_h_loop_filter_chroma   = 
ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 0e7b912678..e30ac1b465 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -30,6 +30,13 @@  const qpel_filters, align=4
          .byte           0,  1, -5, 17, 58,-10, 4, -1
  endconst
  +const qpel_filters_abs, align=4
+        .byte           0,  0,  0,  0,  0,  0, 0,  0
+        .byte           1,  4, 10, 58, 17,  5, 1,  0
+        .byte           1,  4, 11, 40, 40, 11, 4,  1
+        .byte           0,  1,  5, 17, 58, 10, 4,  1
+endconst
+
  .macro load_filter m
          movrel          x15, qpel_filters
          add             x15, x15, \m, lsl #3
@@ -482,3 +489,2219 @@  endfunc