diff mbox series

[FFmpeg-devel,2/2] lavc/aarch64: add hevc epel/qpel assembly

Message ID 20210428195028.80000-3-josh@itanimul.li
State New
Headers show
Series ARM64 HEVC QPEL/EPEL | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate fail Make fate failed
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate warning Make fate failed

Commit Message

Josh Dekker April 28, 2021, 7:50 p.m. UTC
From: Rafal Dabrowa <fatwildcat@gmail.com>

Benchmarked on Apple M1:

put_hevc_epel_bi_h4_8_c: 69.9
put_hevc_epel_bi_h4_8_neon: 15.4
put_hevc_epel_bi_h6_8_c: 137.1
put_hevc_epel_bi_h6_8_neon: 31.9
put_hevc_epel_bi_h8_8_c: 124.6
put_hevc_epel_bi_h8_8_neon: 40.9
put_hevc_epel_bi_h12_8_c: 331.9
put_hevc_epel_bi_h12_8_neon: 72.4
put_hevc_epel_bi_h16_8_c: 383.4
put_hevc_epel_bi_h16_8_neon: 124.9
put_hevc_epel_bi_h24_8_c: 771.6
put_hevc_epel_bi_h24_8_neon: 209.6
put_hevc_epel_bi_h32_8_c: 1324.4
put_hevc_epel_bi_h32_8_neon: 389.4
put_hevc_epel_bi_h48_8_c: 2869.6
put_hevc_epel_bi_h48_8_neon: 730.1
put_hevc_epel_bi_h64_8_c: 4992.6
put_hevc_epel_bi_h64_8_neon: 1490.4
put_hevc_epel_bi_hv4_8_c: 163.4
put_hevc_epel_bi_hv4_8_neon: 38.4
put_hevc_epel_bi_hv6_8_c: 292.4
put_hevc_epel_bi_hv6_8_neon: 66.4
put_hevc_epel_bi_hv8_8_c: 375.6
put_hevc_epel_bi_hv8_8_neon: 62.4
put_hevc_epel_bi_hv12_8_c: 831.6
put_hevc_epel_bi_hv12_8_neon: 134.9
put_hevc_epel_bi_hv16_8_c: 1257.9
put_hevc_epel_bi_hv16_8_neon: 214.1
put_hevc_epel_bi_hv24_8_c: 2666.6
put_hevc_epel_bi_hv24_8_neon: 391.1
put_hevc_epel_bi_hv32_8_c: 4722.4
put_hevc_epel_bi_hv32_8_neon: 734.1
put_hevc_epel_bi_hv48_8_c: 10100.4
put_hevc_epel_bi_hv48_8_neon: 1570.4
put_hevc_epel_bi_hv64_8_c: 17613.4
put_hevc_epel_bi_hv64_8_neon: 2810.6
put_hevc_epel_bi_v4_8_c: 77.4
put_hevc_epel_bi_v4_8_neon: 18.6
put_hevc_epel_bi_v6_8_c: 142.1
put_hevc_epel_bi_v6_8_neon: 27.1
put_hevc_epel_bi_v8_8_c: 192.9
put_hevc_epel_bi_v8_8_neon: 9.1
put_hevc_epel_bi_v12_8_c: 415.6
put_hevc_epel_bi_v12_8_neon: 55.6
put_hevc_epel_bi_v16_8_c: 487.6
put_hevc_epel_bi_v16_8_neon: 61.9
put_hevc_epel_bi_v24_8_c: 957.4
put_hevc_epel_bi_v24_8_neon: 131.1
put_hevc_epel_bi_v32_8_c: 1540.4
put_hevc_epel_bi_v32_8_neon: 210.4
put_hevc_epel_bi_v48_8_c: 3242.9
put_hevc_epel_bi_v48_8_neon: 465.6
put_hevc_epel_bi_v64_8_c: 5441.1
put_hevc_epel_bi_v64_8_neon: 818.1
put_hevc_epel_h4_8_c: 41.6
put_hevc_epel_h4_8_neon: 8.4
put_hevc_epel_h6_8_c: 110.1
put_hevc_epel_h6_8_neon: 24.4
put_hevc_epel_h8_8_c: 41.6
put_hevc_epel_h8_8_neon: 17.6
put_hevc_epel_h12_8_c: 183.1
put_hevc_epel_h12_8_neon: 58.1
put_hevc_epel_h16_8_c: 146.6
put_hevc_epel_h16_8_neon: 83.4
put_hevc_epel_h24_8_c: 240.4
put_hevc_epel_h24_8_neon: 157.1
put_hevc_epel_h32_8_c: 431.1
put_hevc_epel_h32_8_neon: 292.1
put_hevc_epel_h48_8_c: 858.6
put_hevc_epel_h48_8_neon: 557.4
put_hevc_epel_h64_8_c: 1536.6
put_hevc_epel_h64_8_neon: 1116.6
put_hevc_epel_hv4_8_c: 152.6
put_hevc_epel_hv4_8_neon: 34.9
put_hevc_epel_hv6_8_c: 269.6
put_hevc_epel_hv6_8_neon: 61.6
put_hevc_epel_hv8_8_c: 307.4
put_hevc_epel_hv8_8_neon: 76.9
put_hevc_epel_hv12_8_c: 702.6
put_hevc_epel_hv12_8_neon: 113.1
put_hevc_epel_hv16_8_c: 1081.4
put_hevc_epel_hv16_8_neon: 190.6
put_hevc_epel_hv24_8_c: 2276.1
put_hevc_epel_hv24_8_neon: 345.1
put_hevc_epel_hv32_8_c: 4068.6
put_hevc_epel_hv32_8_neon: 780.4
put_hevc_epel_hv48_8_c: 8754.1
put_hevc_epel_hv48_8_neon: 1394.4
put_hevc_epel_hv64_8_c: 15402.1
put_hevc_epel_hv64_8_neon: 2616.6
put_hevc_epel_uni_hv4_8_c: 142.1
put_hevc_epel_uni_hv4_8_neon: 46.6
put_hevc_epel_uni_hv6_8_c: 298.4
put_hevc_epel_uni_hv6_8_neon: 72.4
put_hevc_epel_uni_hv8_8_c: 352.9
put_hevc_epel_uni_hv8_8_neon: 75.1
put_hevc_epel_uni_hv12_8_c: 776.6
put_hevc_epel_uni_hv12_8_neon: 125.9
put_hevc_epel_uni_hv16_8_c: 1216.1
put_hevc_epel_uni_hv16_8_neon: 199.1
put_hevc_epel_uni_hv24_8_c: 2577.9
put_hevc_epel_uni_hv24_8_neon: 386.6
put_hevc_epel_uni_hv32_8_c: 4554.9
put_hevc_epel_uni_hv32_8_neon: 710.9
put_hevc_epel_uni_hv48_8_c: 9869.1
put_hevc_epel_uni_hv48_8_neon: 1499.4
put_hevc_epel_uni_hv64_8_c: 17307.1
put_hevc_epel_uni_hv64_8_neon: 2750.6
put_hevc_epel_uni_v4_8_c: 59.9
put_hevc_epel_uni_v4_8_neon: 21.9
put_hevc_epel_uni_v6_8_c: 136.1
put_hevc_epel_uni_v6_8_neon: 19.6
put_hevc_epel_uni_v8_8_c: 222.4
put_hevc_epel_uni_v8_8_neon: 17.1
put_hevc_epel_uni_v12_8_c: 481.6
put_hevc_epel_uni_v12_8_neon: 42.4
put_hevc_epel_uni_v16_8_c: 424.4
put_hevc_epel_uni_v16_8_neon: 63.4
put_hevc_epel_uni_v24_8_c: 1184.1
put_hevc_epel_uni_v24_8_neon: 109.9
put_hevc_epel_uni_v32_8_c: 1401.1
put_hevc_epel_uni_v32_8_neon: 182.9
put_hevc_epel_uni_v48_8_c: 2933.9
put_hevc_epel_uni_v48_8_neon: 388.9
put_hevc_epel_uni_v64_8_c: 5044.9
put_hevc_epel_uni_v64_8_neon: 701.1
put_hevc_epel_v4_8_c: 31.9
put_hevc_epel_v4_8_neon: 13.4
put_hevc_epel_v6_8_c: 95.1
put_hevc_epel_v6_8_neon: 16.4
put_hevc_epel_v8_8_c: 98.9
put_hevc_epel_v8_8_neon: 26.1
put_hevc_epel_v12_8_c: 283.9
put_hevc_epel_v12_8_neon: 36.9
put_hevc_epel_v16_8_c: 229.6
put_hevc_epel_v16_8_neon: 41.9
put_hevc_epel_v24_8_c: 376.4
put_hevc_epel_v24_8_neon: 90.4
put_hevc_epel_v32_8_c: 577.4
put_hevc_epel_v32_8_neon: 188.4
put_hevc_epel_v48_8_c: 1058.4
put_hevc_epel_v48_8_neon: 350.6
put_hevc_epel_v64_8_c: 1647.4
put_hevc_epel_v64_8_neon: 647.9
put_hevc_pel_bi_pixels4_8_c: 39.1
put_hevc_pel_bi_pixels4_8_neon: 36.4
put_hevc_pel_bi_pixels6_8_c: 78.6
put_hevc_pel_bi_pixels6_8_neon: 0.-6 (sic — garbled negative decimal from the benchmark timer; this figure is not a valid measurement)
put_hevc_pel_bi_pixels8_8_c: 60.6
put_hevc_pel_bi_pixels8_8_neon: 14.1
put_hevc_pel_bi_pixels12_8_c: 186.1
put_hevc_pel_bi_pixels12_8_neon: 30.4
put_hevc_pel_bi_pixels16_8_c: 231.9
put_hevc_pel_bi_pixels16_8_neon: 32.1
put_hevc_pel_bi_pixels24_8_c: 454.1
put_hevc_pel_bi_pixels24_8_neon: 70.1
put_hevc_pel_bi_pixels32_8_c: 774.1
put_hevc_pel_bi_pixels32_8_neon: 102.1
put_hevc_pel_bi_pixels48_8_c: 1632.9
put_hevc_pel_bi_pixels48_8_neon: 220.4
put_hevc_pel_bi_pixels64_8_c: 2812.9
put_hevc_pel_bi_pixels64_8_neon: 402.4
put_hevc_pel_pixels4_8_c: 41.1
put_hevc_pel_pixels4_8_neon: 6.4
put_hevc_pel_pixels6_8_c: 45.1
put_hevc_pel_pixels6_8_neon: 5.4
put_hevc_pel_pixels8_8_c: 94.6
put_hevc_pel_pixels8_8_neon: 15.6
put_hevc_pel_pixels12_8_c: 198.6
put_hevc_pel_pixels12_8_neon: 15.4
put_hevc_pel_pixels16_8_c: 87.9
put_hevc_pel_pixels16_8_neon: 18.1
put_hevc_pel_pixels24_8_c: 310.6
put_hevc_pel_pixels24_8_neon: 39.6
put_hevc_pel_pixels32_8_c: 198.6
put_hevc_pel_pixels32_8_neon: 78.1
put_hevc_pel_pixels48_8_c: 372.4
put_hevc_pel_pixels48_8_neon: 173.1
put_hevc_pel_pixels64_8_c: 569.1
put_hevc_pel_pixels64_8_neon: 324.4
put_hevc_qpel_bi_h4_8_c: 101.4
put_hevc_qpel_bi_h4_8_neon: 34.6
put_hevc_qpel_bi_h6_8_c: 270.1
put_hevc_qpel_bi_h6_8_neon: 61.6
put_hevc_qpel_bi_h8_8_c: 165.6
put_hevc_qpel_bi_h8_8_neon: 62.9
put_hevc_qpel_bi_h12_8_c: 546.4
put_hevc_qpel_bi_h12_8_neon: 124.1
put_hevc_qpel_bi_h16_8_c: 536.9
put_hevc_qpel_bi_h16_8_neon: 178.6
put_hevc_qpel_bi_h24_8_c: 1151.6
put_hevc_qpel_bi_h24_8_neon: 316.6
put_hevc_qpel_bi_h32_8_c: 1981.4
put_hevc_qpel_bi_h32_8_neon: 575.4
put_hevc_qpel_bi_h48_8_c: 4336.6
put_hevc_qpel_bi_h48_8_neon: 1189.6
put_hevc_qpel_bi_h64_8_c: 7591.6
put_hevc_qpel_bi_h64_8_neon: 2184.9
put_hevc_qpel_bi_hv4_8_c: 438.9
put_hevc_qpel_bi_hv4_8_neon: 97.6
put_hevc_qpel_bi_hv6_8_c: 829.1
put_hevc_qpel_bi_hv6_8_neon: 131.4
put_hevc_qpel_bi_hv8_8_c: 983.9
put_hevc_qpel_bi_hv8_8_neon: 146.1
put_hevc_qpel_bi_hv12_8_c: 2050.9
put_hevc_qpel_bi_hv12_8_neon: 364.6
put_hevc_qpel_bi_hv16_8_c: 3028.4
put_hevc_qpel_bi_hv16_8_neon: 432.6
put_hevc_qpel_bi_hv24_8_c: 6294.9
put_hevc_qpel_bi_hv24_8_neon: 910.1
put_hevc_qpel_bi_hv32_8_c: 10583.4
put_hevc_qpel_bi_hv32_8_neon: 1345.9
put_hevc_qpel_bi_hv48_8_c: 22412.4
put_hevc_qpel_bi_hv48_8_neon: 2852.6
put_hevc_qpel_bi_hv64_8_c: 38653.9
put_hevc_qpel_bi_hv64_8_neon: 5094.1
put_hevc_qpel_bi_v4_8_c: 143.9
put_hevc_qpel_bi_v4_8_neon: 25.9
put_hevc_qpel_bi_v6_8_c: 296.6
put_hevc_qpel_bi_v6_8_neon: 35.1
put_hevc_qpel_bi_v8_8_c: 515.4
put_hevc_qpel_bi_v8_8_neon: 31.6
put_hevc_qpel_bi_v12_8_c: 1175.6
put_hevc_qpel_bi_v12_8_neon: 81.1
put_hevc_qpel_bi_v16_8_c: 2051.6
put_hevc_qpel_bi_v16_8_neon: 111.1
put_hevc_qpel_bi_v24_8_c: 4556.9
put_hevc_qpel_bi_v24_8_neon: 208.6
put_hevc_qpel_bi_v32_8_c: 8048.1
put_hevc_qpel_bi_v32_8_neon: 351.6
put_hevc_qpel_bi_v48_8_c: 18009.9
put_hevc_qpel_bi_v48_8_neon: 773.1
put_hevc_qpel_bi_v64_8_c: 31784.9
put_hevc_qpel_bi_v64_8_neon: 1370.6
put_hevc_qpel_h4_8_c: 120.1
put_hevc_qpel_h4_8_neon: 33.1
put_hevc_qpel_h6_8_c: 241.6
put_hevc_qpel_h6_8_neon: 29.1
put_hevc_qpel_h8_8_c: 70.6
put_hevc_qpel_h8_8_neon: 52.6
put_hevc_qpel_h12_8_c: 347.4
put_hevc_qpel_h12_8_neon: 111.1
put_hevc_qpel_h16_8_c: 180.4
put_hevc_qpel_h16_8_neon: 149.9
put_hevc_qpel_h24_8_c: 333.4
put_hevc_qpel_h24_8_neon: 289.1
put_hevc_qpel_h32_8_c: 597.1
put_hevc_qpel_h32_8_neon: 478.9
put_hevc_qpel_h48_8_c: 1262.6
put_hevc_qpel_h48_8_neon: 975.6
put_hevc_qpel_h64_8_c: 2212.4
put_hevc_qpel_h64_8_neon: 1831.9
put_hevc_qpel_hv4_8_c: 430.9
put_hevc_qpel_hv4_8_neon: 77.4
put_hevc_qpel_hv6_8_c: 785.9
put_hevc_qpel_hv6_8_neon: 122.9
put_hevc_qpel_hv8_8_c: 921.9
put_hevc_qpel_hv8_8_neon: 150.1
put_hevc_qpel_hv12_8_c: 1943.4
put_hevc_qpel_hv12_8_neon: 245.4
put_hevc_qpel_hv16_8_c: 2886.9
put_hevc_qpel_hv16_8_neon: 375.4
put_hevc_qpel_hv24_8_c: 5954.6
put_hevc_qpel_hv24_8_neon: 711.4
put_hevc_qpel_hv32_8_c: 9967.1
put_hevc_qpel_hv32_8_neon: 1161.1
put_hevc_qpel_hv48_8_c: 21173.1
put_hevc_qpel_hv48_8_neon: 2593.9
put_hevc_qpel_hv64_8_c: 37378.1
put_hevc_qpel_hv64_8_neon: 4470.4
put_hevc_qpel_uni_h4_8_c: 108.4
put_hevc_qpel_uni_h4_8_neon: 38.9
put_hevc_qpel_uni_h6_8_c: 237.9
put_hevc_qpel_uni_h6_8_neon: 54.6
put_hevc_qpel_uni_h8_8_c: 432.4
put_hevc_qpel_uni_h8_8_neon: 64.9
put_hevc_qpel_uni_h12_8_c: 1019.4
put_hevc_qpel_uni_h12_8_neon: 116.1
put_hevc_qpel_uni_h16_8_c: 463.6
put_hevc_qpel_uni_h16_8_neon: 153.1
put_hevc_qpel_uni_h24_8_c: 1919.4
put_hevc_qpel_uni_h24_8_neon: 292.1
put_hevc_qpel_uni_h32_8_c: 1800.6
put_hevc_qpel_uni_h32_8_neon: 496.9
put_hevc_qpel_uni_h48_8_c: 4056.1
put_hevc_qpel_uni_h48_8_neon: 1071.1
put_hevc_qpel_uni_h64_8_c: 7149.9
put_hevc_qpel_uni_h64_8_neon: 1820.6
put_hevc_qpel_uni_hv4_8_c: 444.6
put_hevc_qpel_uni_hv4_8_neon: 86.6
put_hevc_qpel_uni_hv6_8_c: 810.6
put_hevc_qpel_uni_hv6_8_neon: 121.9
put_hevc_qpel_uni_hv8_8_c: 949.6
put_hevc_qpel_uni_hv8_8_neon: 137.6
put_hevc_qpel_uni_hv12_8_c: 2021.6
put_hevc_qpel_uni_hv12_8_neon: 261.1
put_hevc_qpel_uni_hv16_8_c: 3004.6
put_hevc_qpel_uni_hv16_8_neon: 367.1
put_hevc_qpel_uni_hv24_8_c: 6204.9
put_hevc_qpel_uni_hv24_8_neon: 813.1
put_hevc_qpel_uni_hv32_8_c: 10447.4
put_hevc_qpel_uni_hv32_8_neon: 1216.4
put_hevc_qpel_uni_hv48_8_c: 22322.9
put_hevc_qpel_uni_hv48_8_neon: 2531.6
put_hevc_qpel_uni_hv64_8_c: 38859.9
put_hevc_qpel_uni_hv64_8_neon: 4528.9
put_hevc_qpel_uni_v4_8_c: 124.6
put_hevc_qpel_uni_v4_8_neon: 33.9
put_hevc_qpel_uni_v6_8_c: 260.6
put_hevc_qpel_uni_v6_8_neon: 28.6
put_hevc_qpel_uni_v8_8_c: 480.4
put_hevc_qpel_uni_v8_8_neon: 30.4
put_hevc_qpel_uni_v12_8_c: 1101.4
put_hevc_qpel_uni_v12_8_neon: 72.1
put_hevc_qpel_uni_v16_8_c: 720.4
put_hevc_qpel_uni_v16_8_neon: 87.4
put_hevc_qpel_uni_v24_8_c: 2443.4
put_hevc_qpel_uni_v24_8_neon: 253.9
put_hevc_qpel_uni_v32_8_c: 2328.6
put_hevc_qpel_uni_v32_8_neon: 311.4
put_hevc_qpel_uni_v48_8_c: 4856.9
put_hevc_qpel_uni_v48_8_neon: 692.6
put_hevc_qpel_uni_v64_8_c: 8169.9
put_hevc_qpel_uni_v64_8_neon: 1203.4
put_hevc_qpel_v4_8_c: 123.6
put_hevc_qpel_v4_8_neon: 26.1
put_hevc_qpel_v6_8_c: 259.9
put_hevc_qpel_v6_8_neon: 22.6
put_hevc_qpel_v8_8_c: 197.4
put_hevc_qpel_v8_8_neon: 24.9
put_hevc_qpel_v12_8_c: 561.4
put_hevc_qpel_v12_8_neon: 53.6
put_hevc_qpel_v16_8_c: 474.9
put_hevc_qpel_v16_8_neon: 75.4
put_hevc_qpel_v24_8_c: 799.9
put_hevc_qpel_v24_8_neon: 159.1
put_hevc_qpel_v32_8_c: 1214.1
put_hevc_qpel_v32_8_neon: 267.9
put_hevc_qpel_v48_8_c: 2217.6
put_hevc_qpel_v48_8_neon: 639.1
put_hevc_qpel_v64_8_c: 3495.4
put_hevc_qpel_v64_8_neon: 1081.1

Signed-off-by: Josh Dekker <josh@itanimul.li>
---
 libavcodec/aarch64/Makefile               |    4 +-
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 3931 ++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  118 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 5646 +++++++++++++++++++++
 4 files changed, 9698 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S
 create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S

Comments

chen April 29, 2021, 1:40 a.m. UTC | #1
Inline comments below are marked with the prefix [MC].

At 2021-04-29 03:50:26, "Josh Dekker" <josh@itanimul.li> wrote:
>From: Rafal Dabrowa <fatwildcat@gmail.com>
>
>Benchmarked on Apple M1:
>
>put_hevc_epel_bi_h4_8_c: 69.9
>put_hevc_epel_bi_h4_8_neon: 15.4
>put_hevc_epel_bi_h6_8_c: 137.1
>put_hevc_epel_bi_h6_8_neon: 31.9
>put_hevc_epel_bi_h8_8_c: 124.6
>put_hevc_epel_bi_h8_8_neon: 40.9
>put_hevc_epel_bi_h12_8_c: 331.9
>put_hevc_epel_bi_h12_8_neon: 72.4
>put_hevc_epel_bi_h16_8_c: 383.4
>put_hevc_epel_bi_h16_8_neon: 124.9
>put_hevc_epel_bi_h24_8_c: 771.6
>put_hevc_epel_bi_h24_8_neon: 209.6
>put_hevc_epel_bi_h32_8_c: 1324.4
>put_hevc_epel_bi_h32_8_neon: 389.4
>put_hevc_epel_bi_h48_8_c: 2869.6
>put_hevc_epel_bi_h48_8_neon: 730.1
>put_hevc_epel_bi_h64_8_c: 4992.6
>put_hevc_epel_bi_h64_8_neon: 1490.4
>put_hevc_epel_bi_hv4_8_c: 163.4
>put_hevc_epel_bi_hv4_8_neon: 38.4
>put_hevc_epel_bi_hv6_8_c: 292.4
>put_hevc_epel_bi_hv6_8_neon: 66.4
>put_hevc_epel_bi_hv8_8_c: 375.6
>put_hevc_epel_bi_hv8_8_neon: 62.4
>put_hevc_epel_bi_hv12_8_c: 831.6
>put_hevc_epel_bi_hv12_8_neon: 134.9
>put_hevc_epel_bi_hv16_8_c: 1257.9
>put_hevc_epel_bi_hv16_8_neon: 214.1
>put_hevc_epel_bi_hv24_8_c: 2666.6
>put_hevc_epel_bi_hv24_8_neon: 391.1
>put_hevc_epel_bi_hv32_8_c: 4722.4
>put_hevc_epel_bi_hv32_8_neon: 734.1
>put_hevc_epel_bi_hv48_8_c: 10100.4
>put_hevc_epel_bi_hv48_8_neon: 1570.4
>put_hevc_epel_bi_hv64_8_c: 17613.4
>put_hevc_epel_bi_hv64_8_neon: 2810.6
>put_hevc_epel_bi_v4_8_c: 77.4
>put_hevc_epel_bi_v4_8_neon: 18.6
>put_hevc_epel_bi_v6_8_c: 142.1
>put_hevc_epel_bi_v6_8_neon: 27.1
>put_hevc_epel_bi_v8_8_c: 192.9
>put_hevc_epel_bi_v8_8_neon: 9.1
>put_hevc_epel_bi_v12_8_c: 415.6
>put_hevc_epel_bi_v12_8_neon: 55.6
>put_hevc_epel_bi_v16_8_c: 487.6
>put_hevc_epel_bi_v16_8_neon: 61.9
>put_hevc_epel_bi_v24_8_c: 957.4
>put_hevc_epel_bi_v24_8_neon: 131.1
>put_hevc_epel_bi_v32_8_c: 1540.4
>put_hevc_epel_bi_v32_8_neon: 210.4
>put_hevc_epel_bi_v48_8_c: 3242.9
>put_hevc_epel_bi_v48_8_neon: 465.6
>put_hevc_epel_bi_v64_8_c: 5441.1
>put_hevc_epel_bi_v64_8_neon: 818.1
>put_hevc_epel_h4_8_c: 41.6
>put_hevc_epel_h4_8_neon: 8.4
>put_hevc_epel_h6_8_c: 110.1
>put_hevc_epel_h6_8_neon: 24.4
>put_hevc_epel_h8_8_c: 41.6
>put_hevc_epel_h8_8_neon: 17.6
>put_hevc_epel_h12_8_c: 183.1
>put_hevc_epel_h12_8_neon: 58.1
>put_hevc_epel_h16_8_c: 146.6
>put_hevc_epel_h16_8_neon: 83.4
>put_hevc_epel_h24_8_c: 240.4
>put_hevc_epel_h24_8_neon: 157.1
>put_hevc_epel_h32_8_c: 431.1
>put_hevc_epel_h32_8_neon: 292.1
>put_hevc_epel_h48_8_c: 858.6
>put_hevc_epel_h48_8_neon: 557.4
>put_hevc_epel_h64_8_c: 1536.6
>put_hevc_epel_h64_8_neon: 1116.6
>put_hevc_epel_hv4_8_c: 152.6
>put_hevc_epel_hv4_8_neon: 34.9
>put_hevc_epel_hv6_8_c: 269.6
>put_hevc_epel_hv6_8_neon: 61.6
>put_hevc_epel_hv8_8_c: 307.4
>put_hevc_epel_hv8_8_neon: 76.9
>put_hevc_epel_hv12_8_c: 702.6
>put_hevc_epel_hv12_8_neon: 113.1
>put_hevc_epel_hv16_8_c: 1081.4
>put_hevc_epel_hv16_8_neon: 190.6
>put_hevc_epel_hv24_8_c: 2276.1
>put_hevc_epel_hv24_8_neon: 345.1
>put_hevc_epel_hv32_8_c: 4068.6
>put_hevc_epel_hv32_8_neon: 780.4
>put_hevc_epel_hv48_8_c: 8754.1
>put_hevc_epel_hv48_8_neon: 1394.4
>put_hevc_epel_hv64_8_c: 15402.1
>put_hevc_epel_hv64_8_neon: 2616.6
>put_hevc_epel_uni_hv4_8_c: 142.1
>put_hevc_epel_uni_hv4_8_neon: 46.6
>put_hevc_epel_uni_hv6_8_c: 298.4
>put_hevc_epel_uni_hv6_8_neon: 72.4
>put_hevc_epel_uni_hv8_8_c: 352.9
>put_hevc_epel_uni_hv8_8_neon: 75.1
>put_hevc_epel_uni_hv12_8_c: 776.6
>put_hevc_epel_uni_hv12_8_neon: 125.9
>put_hevc_epel_uni_hv16_8_c: 1216.1
>put_hevc_epel_uni_hv16_8_neon: 199.1
>put_hevc_epel_uni_hv24_8_c: 2577.9
>put_hevc_epel_uni_hv24_8_neon: 386.6
>put_hevc_epel_uni_hv32_8_c: 4554.9
>put_hevc_epel_uni_hv32_8_neon: 710.9
>put_hevc_epel_uni_hv48_8_c: 9869.1
>put_hevc_epel_uni_hv48_8_neon: 1499.4
>put_hevc_epel_uni_hv64_8_c: 17307.1
>put_hevc_epel_uni_hv64_8_neon: 2750.6
>put_hevc_epel_uni_v4_8_c: 59.9
>put_hevc_epel_uni_v4_8_neon: 21.9
>put_hevc_epel_uni_v6_8_c: 136.1
>put_hevc_epel_uni_v6_8_neon: 19.6
>put_hevc_epel_uni_v8_8_c: 222.4
>put_hevc_epel_uni_v8_8_neon: 17.1
>put_hevc_epel_uni_v12_8_c: 481.6
>put_hevc_epel_uni_v12_8_neon: 42.4
>put_hevc_epel_uni_v16_8_c: 424.4
>put_hevc_epel_uni_v16_8_neon: 63.4
>put_hevc_epel_uni_v24_8_c: 1184.1
>put_hevc_epel_uni_v24_8_neon: 109.9
>put_hevc_epel_uni_v32_8_c: 1401.1
>put_hevc_epel_uni_v32_8_neon: 182.9
>put_hevc_epel_uni_v48_8_c: 2933.9
>put_hevc_epel_uni_v48_8_neon: 388.9
>put_hevc_epel_uni_v64_8_c: 5044.9
>put_hevc_epel_uni_v64_8_neon: 701.1
>put_hevc_epel_v4_8_c: 31.9
>put_hevc_epel_v4_8_neon: 13.4
>put_hevc_epel_v6_8_c: 95.1
>put_hevc_epel_v6_8_neon: 16.4
>put_hevc_epel_v8_8_c: 98.9
>put_hevc_epel_v8_8_neon: 26.1
>put_hevc_epel_v12_8_c: 283.9
>put_hevc_epel_v12_8_neon: 36.9
>put_hevc_epel_v16_8_c: 229.6
>put_hevc_epel_v16_8_neon: 41.9
>put_hevc_epel_v24_8_c: 376.4
>put_hevc_epel_v24_8_neon: 90.4
>put_hevc_epel_v32_8_c: 577.4
>put_hevc_epel_v32_8_neon: 188.4
>put_hevc_epel_v48_8_c: 1058.4
>put_hevc_epel_v48_8_neon: 350.6
>put_hevc_epel_v64_8_c: 1647.4
>put_hevc_epel_v64_8_neon: 647.9
>put_hevc_pel_bi_pixels4_8_c: 39.1
>put_hevc_pel_bi_pixels4_8_neon: 36.4
>put_hevc_pel_bi_pixels6_8_c: 78.6
>put_hevc_pel_bi_pixels6_8_neon: 0.-6
>put_hevc_pel_bi_pixels8_8_c: 60.6
>put_hevc_pel_bi_pixels8_8_neon: 14.1
>put_hevc_pel_bi_pixels12_8_c: 186.1
>put_hevc_pel_bi_pixels12_8_neon: 30.4
>put_hevc_pel_bi_pixels16_8_c: 231.9
>put_hevc_pel_bi_pixels16_8_neon: 32.1
>put_hevc_pel_bi_pixels24_8_c: 454.1
>put_hevc_pel_bi_pixels24_8_neon: 70.1
>put_hevc_pel_bi_pixels32_8_c: 774.1
>put_hevc_pel_bi_pixels32_8_neon: 102.1
>put_hevc_pel_bi_pixels48_8_c: 1632.9
>put_hevc_pel_bi_pixels48_8_neon: 220.4
>put_hevc_pel_bi_pixels64_8_c: 2812.9
>put_hevc_pel_bi_pixels64_8_neon: 402.4
>put_hevc_pel_pixels4_8_c: 41.1
>put_hevc_pel_pixels4_8_neon: 6.4
>put_hevc_pel_pixels6_8_c: 45.1
>put_hevc_pel_pixels6_8_neon: 5.4
>put_hevc_pel_pixels8_8_c: 94.6
>put_hevc_pel_pixels8_8_neon: 15.6
>put_hevc_pel_pixels12_8_c: 198.6
>put_hevc_pel_pixels12_8_neon: 15.4
>put_hevc_pel_pixels16_8_c: 87.9
>put_hevc_pel_pixels16_8_neon: 18.1
>put_hevc_pel_pixels24_8_c: 310.6
>put_hevc_pel_pixels24_8_neon: 39.6
>put_hevc_pel_pixels32_8_c: 198.6
>put_hevc_pel_pixels32_8_neon: 78.1
>put_hevc_pel_pixels48_8_c: 372.4
>put_hevc_pel_pixels48_8_neon: 173.1
>put_hevc_pel_pixels64_8_c: 569.1
>put_hevc_pel_pixels64_8_neon: 324.4
>put_hevc_qpel_bi_h4_8_c: 101.4
>put_hevc_qpel_bi_h4_8_neon: 34.6
>put_hevc_qpel_bi_h6_8_c: 270.1
>put_hevc_qpel_bi_h6_8_neon: 61.6
>put_hevc_qpel_bi_h8_8_c: 165.6
>put_hevc_qpel_bi_h8_8_neon: 62.9
>put_hevc_qpel_bi_h12_8_c: 546.4
>put_hevc_qpel_bi_h12_8_neon: 124.1
>put_hevc_qpel_bi_h16_8_c: 536.9
>put_hevc_qpel_bi_h16_8_neon: 178.6
>put_hevc_qpel_bi_h24_8_c: 1151.6
>put_hevc_qpel_bi_h24_8_neon: 316.6
>put_hevc_qpel_bi_h32_8_c: 1981.4
>put_hevc_qpel_bi_h32_8_neon: 575.4
>put_hevc_qpel_bi_h48_8_c: 4336.6
>put_hevc_qpel_bi_h48_8_neon: 1189.6
>put_hevc_qpel_bi_h64_8_c: 7591.6
>put_hevc_qpel_bi_h64_8_neon: 2184.9
>put_hevc_qpel_bi_hv4_8_c: 438.9
>put_hevc_qpel_bi_hv4_8_neon: 97.6
>put_hevc_qpel_bi_hv6_8_c: 829.1
>put_hevc_qpel_bi_hv6_8_neon: 131.4
>put_hevc_qpel_bi_hv8_8_c: 983.9
>put_hevc_qpel_bi_hv8_8_neon: 146.1
>put_hevc_qpel_bi_hv12_8_c: 2050.9
>put_hevc_qpel_bi_hv12_8_neon: 364.6
>put_hevc_qpel_bi_hv16_8_c: 3028.4
>put_hevc_qpel_bi_hv16_8_neon: 432.6
>put_hevc_qpel_bi_hv24_8_c: 6294.9
>put_hevc_qpel_bi_hv24_8_neon: 910.1
>put_hevc_qpel_bi_hv32_8_c: 10583.4
>put_hevc_qpel_bi_hv32_8_neon: 1345.9
>put_hevc_qpel_bi_hv48_8_c: 22412.4
>put_hevc_qpel_bi_hv48_8_neon: 2852.6
>put_hevc_qpel_bi_hv64_8_c: 38653.9
>put_hevc_qpel_bi_hv64_8_neon: 5094.1
>put_hevc_qpel_bi_v4_8_c: 143.9
>put_hevc_qpel_bi_v4_8_neon: 25.9
>put_hevc_qpel_bi_v6_8_c: 296.6
>put_hevc_qpel_bi_v6_8_neon: 35.1
>put_hevc_qpel_bi_v8_8_c: 515.4
>put_hevc_qpel_bi_v8_8_neon: 31.6
>put_hevc_qpel_bi_v12_8_c: 1175.6
>put_hevc_qpel_bi_v12_8_neon: 81.1
>put_hevc_qpel_bi_v16_8_c: 2051.6
>put_hevc_qpel_bi_v16_8_neon: 111.1
>put_hevc_qpel_bi_v24_8_c: 4556.9
>put_hevc_qpel_bi_v24_8_neon: 208.6
>put_hevc_qpel_bi_v32_8_c: 8048.1
>put_hevc_qpel_bi_v32_8_neon: 351.6
>put_hevc_qpel_bi_v48_8_c: 18009.9
>put_hevc_qpel_bi_v48_8_neon: 773.1
>put_hevc_qpel_bi_v64_8_c: 31784.9
>put_hevc_qpel_bi_v64_8_neon: 1370.6
>put_hevc_qpel_h4_8_c: 120.1
>put_hevc_qpel_h4_8_neon: 33.1
>put_hevc_qpel_h6_8_c: 241.6
>put_hevc_qpel_h6_8_neon: 29.1
>put_hevc_qpel_h8_8_c: 70.6
>put_hevc_qpel_h8_8_neon: 52.6
>put_hevc_qpel_h12_8_c: 347.4
>put_hevc_qpel_h12_8_neon: 111.1
>put_hevc_qpel_h16_8_c: 180.4
>put_hevc_qpel_h16_8_neon: 149.9
>put_hevc_qpel_h24_8_c: 333.4
>put_hevc_qpel_h24_8_neon: 289.1
>put_hevc_qpel_h32_8_c: 597.1
>put_hevc_qpel_h32_8_neon: 478.9
>put_hevc_qpel_h48_8_c: 1262.6
>put_hevc_qpel_h48_8_neon: 975.6
>put_hevc_qpel_h64_8_c: 2212.4
>put_hevc_qpel_h64_8_neon: 1831.9
>put_hevc_qpel_hv4_8_c: 430.9
>put_hevc_qpel_hv4_8_neon: 77.4
>put_hevc_qpel_hv6_8_c: 785.9
>put_hevc_qpel_hv6_8_neon: 122.9
>put_hevc_qpel_hv8_8_c: 921.9
>put_hevc_qpel_hv8_8_neon: 150.1
>put_hevc_qpel_hv12_8_c: 1943.4
>put_hevc_qpel_hv12_8_neon: 245.4
>put_hevc_qpel_hv16_8_c: 2886.9
>put_hevc_qpel_hv16_8_neon: 375.4
>put_hevc_qpel_hv24_8_c: 5954.6
>put_hevc_qpel_hv24_8_neon: 711.4
>put_hevc_qpel_hv32_8_c: 9967.1
>put_hevc_qpel_hv32_8_neon: 1161.1
>put_hevc_qpel_hv48_8_c: 21173.1
>put_hevc_qpel_hv48_8_neon: 2593.9
>put_hevc_qpel_hv64_8_c: 37378.1
>put_hevc_qpel_hv64_8_neon: 4470.4
>put_hevc_qpel_uni_h4_8_c: 108.4
>put_hevc_qpel_uni_h4_8_neon: 38.9
>put_hevc_qpel_uni_h6_8_c: 237.9
>put_hevc_qpel_uni_h6_8_neon: 54.6
>put_hevc_qpel_uni_h8_8_c: 432.4
>put_hevc_qpel_uni_h8_8_neon: 64.9
>put_hevc_qpel_uni_h12_8_c: 1019.4
>put_hevc_qpel_uni_h12_8_neon: 116.1
>put_hevc_qpel_uni_h16_8_c: 463.6
>put_hevc_qpel_uni_h16_8_neon: 153.1
>put_hevc_qpel_uni_h24_8_c: 1919.4
>put_hevc_qpel_uni_h24_8_neon: 292.1
>put_hevc_qpel_uni_h32_8_c: 1800.6
>put_hevc_qpel_uni_h32_8_neon: 496.9
>put_hevc_qpel_uni_h48_8_c: 4056.1
>put_hevc_qpel_uni_h48_8_neon: 1071.1
>put_hevc_qpel_uni_h64_8_c: 7149.9
>put_hevc_qpel_uni_h64_8_neon: 1820.6
>put_hevc_qpel_uni_hv4_8_c: 444.6
>put_hevc_qpel_uni_hv4_8_neon: 86.6
>put_hevc_qpel_uni_hv6_8_c: 810.6
>put_hevc_qpel_uni_hv6_8_neon: 121.9
>put_hevc_qpel_uni_hv8_8_c: 949.6
>put_hevc_qpel_uni_hv8_8_neon: 137.6
>put_hevc_qpel_uni_hv12_8_c: 2021.6
>put_hevc_qpel_uni_hv12_8_neon: 261.1
>put_hevc_qpel_uni_hv16_8_c: 3004.6
>put_hevc_qpel_uni_hv16_8_neon: 367.1
>put_hevc_qpel_uni_hv24_8_c: 6204.9
>put_hevc_qpel_uni_hv24_8_neon: 813.1
>put_hevc_qpel_uni_hv32_8_c: 10447.4
>put_hevc_qpel_uni_hv32_8_neon: 1216.4
>put_hevc_qpel_uni_hv48_8_c: 22322.9
>put_hevc_qpel_uni_hv48_8_neon: 2531.6
>put_hevc_qpel_uni_hv64_8_c: 38859.9
>put_hevc_qpel_uni_hv64_8_neon: 4528.9
>put_hevc_qpel_uni_v4_8_c: 124.6
>put_hevc_qpel_uni_v4_8_neon: 33.9
>put_hevc_qpel_uni_v6_8_c: 260.6
>put_hevc_qpel_uni_v6_8_neon: 28.6
>put_hevc_qpel_uni_v8_8_c: 480.4
>put_hevc_qpel_uni_v8_8_neon: 30.4
>put_hevc_qpel_uni_v12_8_c: 1101.4
>put_hevc_qpel_uni_v12_8_neon: 72.1
>put_hevc_qpel_uni_v16_8_c: 720.4
>put_hevc_qpel_uni_v16_8_neon: 87.4
>put_hevc_qpel_uni_v24_8_c: 2443.4
>put_hevc_qpel_uni_v24_8_neon: 253.9
>put_hevc_qpel_uni_v32_8_c: 2328.6
>put_hevc_qpel_uni_v32_8_neon: 311.4
>put_hevc_qpel_uni_v48_8_c: 4856.9
>put_hevc_qpel_uni_v48_8_neon: 692.6
>put_hevc_qpel_uni_v64_8_c: 8169.9
>put_hevc_qpel_uni_v64_8_neon: 1203.4
>put_hevc_qpel_v4_8_c: 123.6
>put_hevc_qpel_v4_8_neon: 26.1
>put_hevc_qpel_v6_8_c: 259.9
>put_hevc_qpel_v6_8_neon: 22.6
>put_hevc_qpel_v8_8_c: 197.4
>put_hevc_qpel_v8_8_neon: 24.9
>put_hevc_qpel_v12_8_c: 561.4
>put_hevc_qpel_v12_8_neon: 53.6
>put_hevc_qpel_v16_8_c: 474.9
>put_hevc_qpel_v16_8_neon: 75.4
>put_hevc_qpel_v24_8_c: 799.9
>put_hevc_qpel_v24_8_neon: 159.1
>put_hevc_qpel_v32_8_c: 1214.1
>put_hevc_qpel_v32_8_neon: 267.9
>put_hevc_qpel_v48_8_c: 2217.6
>put_hevc_qpel_v48_8_neon: 639.1
>put_hevc_qpel_v64_8_c: 3495.4
>put_hevc_qpel_v64_8_neon: 1081.1
>
>Signed-off-by: Josh Dekker <josh@itanimul.li>
>---
> libavcodec/aarch64/Makefile               |    4 +-
> libavcodec/aarch64/hevcdsp_epel_neon.S    | 3931 ++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  118 +
> libavcodec/aarch64/hevcdsp_qpel_neon.S    | 5646 +++++++++++++++++++++
> 4 files changed, 9698 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S
> create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
>
>diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
>index 954461f81d..ebedc03bfa 100644
>--- a/libavcodec/aarch64/Makefile
>+++ b/libavcodec/aarch64/Makefile
>@@ -61,6 +61,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
>                                            aarch64/vp9lpf_neon.o               \
>                                            aarch64/vp9mc_16bpp_neon.o          \
>                                            aarch64/vp9mc_neon.o
>-NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
>+NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_epel_neon.o         \
>+                                           aarch64/hevcdsp_idct_neon.o         \
>                                            aarch64/hevcdsp_init_aarch64.o      \
>+                                           aarch64/hevcdsp_qpel_neon.o         \
>                                            aarch64/hevcdsp_sao_neon.o
>diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
>new file mode 100644
>index 0000000000..0366fe8ae3
>--- /dev/null
>+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
>@@ -0,0 +1,3931 @@
>+/* -*-arm64-*-
>+ * vim: syntax=arm64asm
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include "libavutil/aarch64/asm.S"
>+#define MAX_PB_SIZE 64
>+
>+function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2)

>+1:      ld1            {v0.s}[0], [x1], x2
[MC] I don't have an M1, so I am not sure how this instruction performs there.
However, per the Cortex-A57 optimization guide, LD1 has latency=8, throughput=1, while LD1R has latency=5, throughput=1.
Moreover, I guess all of the interpolation functions work on an even number of rows, so we could unroll a little.
Further, we could insert the SUB in between the LD/ST to avoid a pipeline stall, and use CBNZ so the flags register is not affected.


>+        ushll           v4.8h, v0.8b, #6
>+        st1            {v4.d}[0], [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2 - 8)
>+1:      ld1            {v0.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        st1            {v4.d}[0], [x0], #8
>+        st1            {v4.s}[2], [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        st1            {v4.8h}, [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2 - 16)
>+1:      ld1            {v0.8b, v1.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        st1            {v4.8h}, [x0], #16
>+        ushll           v5.8h, v1.8b, #6
>+        st1            {v5.d}[0], [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.8b, v1.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        ushll           v5.8h, v1.8b, #6
>+        st1            {v4.8h, v5.8h}, [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.8b, v1.8b, v2.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        ushll           v5.8h, v1.8b, #6
>+        ushll           v6.8h, v2.8b, #6
>+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.8b, v1.8b, v2.8b, v3.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        ushll           v5.8h, v1.8b, #6
>+        ushll           v6.8h, v2.8b, #6
>+        ushll           v7.8h, v3.8b, #6
>+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE)
>+1:      ld1            {v0.16b, v1.16b, v2.16b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        ushll2          v5.8h, v0.16b, #6
>+        ushll           v6.8h, v1.8b, #6
>+        ushll2          v7.8h, v1.16b, #6
>+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
>+        ushll           v4.8h, v2.8b, #6
>+        ushll2          v5.8h, v2.16b, #6
>+        st1            {v4.8h, v5.8h}, [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
>+1:      ld1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        ushll2          v5.8h, v0.16b, #6
>+        ushll           v6.8h, v1.8b, #6
>+        ushll2          v7.8h, v1.16b, #6
>+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #(MAX_PB_SIZE)
>+        ushll           v4.8h, v2.8b, #6
>+        ushll2          v5.8h, v2.16b, #6
>+        ushll           v6.8h, v3.8b, #6
>+        ushll2          v7.8h, v3.16b, #6
>+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #(MAX_PB_SIZE)
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.s}[0], [x2], x3 // src
>+        ushll           v16.8h, v0.8b, #6
>+        ld1            {v20.4h}, [x4], x10 // src2
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqrshrun        v0.8b,  v16.8h, #7
>+        st1            {v0.s}[0], [x0], x1
>+        subs            x5, x5, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2 - 8) // src2 stride minus the 8 bytes already consumed below
>+        sub             x1, x1, #4 // dststride compensation: first store post-increments by 4
>+1:      ld1            {v0.8b}, [x2], x3 // src (u8); width 6 handled as 4 + 2 pixels
>+        ushll           v16.8h, v0.8b, #6 // src * 64
>+        ld1            {v20.4h}, [x4], #8 // src2, first 4 halfwords
>+        ld1            {v20.s}[2], [x4], x10 // src2, remaining 2 halfwords
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqrshrun        v0.8b,  v16.8h, #7 // clip_u8((sum + 64) >> 7)
>+        st1            {v0.s}[0], [x0], #4 // store 4 pixels...
>+        st1            {v0.h}[2], [x0], x1 // ...then the last 2
>+        subs            x5, x5, #1 // height countdown
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2) // src2 row stride in bytes
>+1:      ld1            {v0.8b}, [x2], x3     // src (u8); x0 = dst, x1 = dststride, x5 = height
>+        ushll           v16.8h, v0.8b, #6 // src * 64
>+        ld1            {v20.8h}, [x4], x10   // src2 (i16 intermediate)
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqrshrun        v0.8b,  v16.8h, #7 // clip_u8((sum + 64) >> 7)
>+        st1            {v0.8b}, [x0], x1
>+        subs            x5, x5, #1 // height countdown
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2 - 16) // src2 stride minus the 16 bytes already consumed below
>+        sub             x1, x1, #8 // dststride compensation: first store post-increments by 8
>+1:      ld1            {v0.16b}, [x2], x3 // src (u8); width 12 handled as 8 + 4 pixels
>+        ushll           v16.8h, v0.8b, #6 // src * 64
>+        ushll2          v17.8h, v0.16b, #6
>+        ld1            {v20.8h}, [x4], #16 // src2, first 8 halfwords
>+        ld1            {v21.4h}, [x4], x10 // src2, remaining 4 halfwords
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqadd           v17.8h, v17.8h, v21.8h
>+        sqrshrun        v0.8b,  v16.8h, #7 // clip_u8((sum + 64) >> 7)
>+        sqrshrun2       v0.16b, v17.8h, #7
>+        st1            {v0.8b}, [x0], #8 // store 8 pixels...
>+        st1            {v0.s}[2], [x0], x1 // ...then the last 4
>+        subs            x5, x5, #1 // height countdown
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2) // src2 row stride in bytes
>+1:      ld1            {v0.16b}, [x2], x3            // src (u8); x0 = dst, x1 = dststride, x5 = height
>+        ushll           v16.8h, v0.8b, #6 // src * 64
>+        ushll2          v17.8h, v0.16b, #6
>+        ld1            {v20.8h, v21.8h}, [x4], x10   // src2 (i16 intermediate)
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqadd           v17.8h, v17.8h, v21.8h
>+        sqrshrun        v0.8b,  v16.8h, #7 // clip_u8((sum + 64) >> 7)
>+        sqrshrun2       v0.16b, v17.8h, #7
>+        st1            {v0.16b}, [x0], x1
>+        subs            x5, x5, #1 // height countdown
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2) // src2 row stride in bytes
>+1:      ld1            {v0.8b, v1.8b, v2.8b}, [x2], x3  // src (u8); x0 = dst, x1 = dststride, x5 = height
>+        ushll           v16.8h, v0.8b, #6 // src * 64
>+        ushll           v17.8h, v1.8b, #6
>+        ushll           v18.8h, v2.8b, #6
>+        ld1            {v20.8h, v21.8h, v22.8h}, [x4], x10   // src2 (i16 intermediate)
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqadd           v17.8h, v17.8h, v21.8h
>+        sqadd           v18.8h, v18.8h, v22.8h
>+        sqrshrun        v0.8b, v16.8h, #7 // clip_u8((sum + 64) >> 7)
>+        sqrshrun        v1.8b, v17.8h, #7
>+        sqrshrun        v2.8b, v18.8h, #7
>+        st1            {v0.8b, v1.8b, v2.8b}, [x0], x1
>+        subs            x5, x5, #1 // height countdown
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2) // src2 row stride in bytes
>+1:      ld1            {v0.16b, v1.16b}, [x2], x3            // src (u8); x0 = dst, x1 = dststride, x5 = height
>+        ushll           v16.8h, v0.8b, #6 // src * 64
>+        ushll2          v17.8h, v0.16b, #6
>+        ushll           v18.8h, v1.8b, #6
>+        ushll2          v19.8h, v1.16b, #6
>+        ld1            {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], x10   // src2 (i16 intermediate)
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqadd           v17.8h, v17.8h, v21.8h
>+        sqadd           v18.8h, v18.8h, v22.8h
>+        sqadd           v19.8h, v19.8h, v23.8h
>+        sqrshrun        v0.8b,  v16.8h, #7 // clip_u8((sum + 64) >> 7)
>+        sqrshrun2       v0.16b, v17.8h, #7
>+        sqrshrun        v1.8b,  v18.8h, #7
>+        sqrshrun2       v1.16b, v19.8h, #7
>+        st1            {v0.16b, v1.16b}, [x0], x1
>+        subs            x5, x5, #1 // height countdown
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE) // remaining advance: 64 (first ld1) + 64 = full src2 stride of 2*MAX_PB_SIZE bytes
>+1:      ld1            {v0.16b, v1.16b, v2.16b}, [x2], x3            // src (u8); x0 = dst, x1 = dststride, x5 = height
>+        ushll           v16.8h, v0.8b, #6 // src * 64
>+        ushll2          v17.8h, v0.16b, #6
>+        ushll           v18.8h, v1.8b, #6
>+        ushll2          v19.8h, v1.16b, #6
>+        ushll           v20.8h, v2.8b, #6
>+        ushll2          v21.8h, v2.16b, #6
>+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)        // src2, first 32 halfwords
>+        sqadd           v16.8h, v16.8h, v24.8h
>+        sqadd           v17.8h, v17.8h, v25.8h
>+        sqadd           v18.8h, v18.8h, v26.8h
>+        sqadd           v19.8h, v19.8h, v27.8h
>+        ld1            {v24.8h, v25.8h}, [x4], x10 // src2, last 16 halfwords; x10 skips to next row
>+        sqadd           v20.8h, v20.8h, v24.8h
>+        sqadd           v21.8h, v21.8h, v25.8h
>+        sqrshrun        v0.8b, v16.8h, #7 // clip_u8((sum + 64) >> 7)
>+        sqrshrun2       v0.16b, v17.8h, #7
>+        sqrshrun        v1.8b, v18.8h, #7
>+        sqrshrun2       v1.16b, v19.8h, #7
>+        sqrshrun        v2.8b, v20.8h, #7
>+        sqrshrun2       v2.16b, v21.8h, #7
>+        st1            {v0.16b, v1.16b, v2.16b}, [x0], x1
>+        subs            x5, x5, #1 // height countdown
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1
>+1:      ld1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3            // src (u8); x0 = dst, x1 = dststride, x5 = height
>+        ushll           v16.8h, v0.8b, #6 // src * 64 (intermediate precision)
>+        ushll2          v17.8h, v0.16b, #6
>+        ushll           v18.8h, v1.8b, #6
>+        ushll2          v19.8h, v1.16b, #6
>+        ushll           v20.8h, v2.8b, #6
>+        ushll2          v21.8h, v2.16b, #6
>+        ushll           v22.8h, v3.8b, #6
>+        ushll2          v23.8h, v3.16b, #6
>+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)        // src2, first 32 halfwords
>+        sqadd           v16.8h, v16.8h, v24.8h
>+        sqadd           v17.8h, v17.8h, v25.8h
>+        sqadd           v18.8h, v18.8h, v26.8h
>+        sqadd           v19.8h, v19.8h, v27.8h
>+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) // src2, second 32 halfwords
>+        sqadd           v20.8h, v20.8h, v24.8h
>+        sqadd           v21.8h, v21.8h, v25.8h
>+        sqadd           v22.8h, v22.8h, v26.8h
>+        sqadd           v23.8h, v23.8h, v27.8h
>+        sqrshrun        v0.8b, v16.8h, #7 // clip_u8((sum + 64) >> 7)
>+        sqrshrun2       v0.16b, v17.8h, #7
>+        sqrshrun        v1.8b, v18.8h, #7
>+        sqrshrun2       v1.16b, v19.8h, #7
>+        sqrshrun        v2.8b, v20.8h, #7
>+        sqrshrun2       v2.16b, v21.8h, #7
>+        sqrshrun        v3.8b, v22.8h, #7
>+        sqrshrun2       v3.16b, v23.8h, #7
>+        st1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
>+        subs            x5, x5, #1 // height countdown
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+.Lepel_filters:        // 4-tap EPEL (chroma) filter coefficients, one 4-byte row per subpel index 0..7
>+        .byte  0,  0,  0,  0 // index 0: full-pel, no filtering
>+        .byte -2, 58, 10, -2
>+        .byte -4, 54, 16, -2
>+        .byte -6, 46, 28, -4
>+        .byte -4, 36, 36, -4 // index 4: symmetric half-sample position
>+        .byte -4, 28, 46, -6
>+        .byte -2, 16, 54, -4
>+        .byte -2, 10, 58, -2
>+
>+.macro load_epel_filterb freg, xreg
>+        adr             \xreg, .Lepel_filters
>+        add             \xreg, \xreg, \freg, lsl #2
>+        ld4r           {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg] // filter
>+        neg             v0.16b, v0.16b

>+        neg             v3.16b, v3.16b
[MC] Why not put abs(x) in the constant table?


>+.endm
>+
>+.macro calc_epelb dst, src1, src2, src3, src4
>+        umlsl           \dst\().8h, \src1\().8b, v0.8b // 4-tap filter on low 8 u8 lanes; v0-v3 from load_epel_filterb
>+        umlal           \dst\().8h, \src2\().8b, v1.8b
>+        umlal           \dst\().8h, \src3\().8b, v2.8b
>+        umlsl           \dst\().8h, \src4\().8b, v3.8b // taps 0/3 were negated on load, so subtract here
>+.endm
>+
>+.macro calc_epelb2 dst, src1, src2, src3, src4
>+        umlsl2          \dst\().8h, \src1\().16b, v0.16b // same as calc_epelb, but on the high 8 lanes
>+        umlal2          \dst\().8h, \src2\().16b, v1.16b
>+        umlal2          \dst\().8h, \src3\().16b, v2.16b
>+        umlsl2          \dst\().8h, \src4\().16b, v3.16b // taps 0/3 were negated on load, so subtract here
>+.endm
>+
>+.macro load_epel_filterh freg, xreg
>+        adr             \xreg, .Lepel_filters // \freg = subpel index, \xreg = scratch
>+        add             \xreg, \xreg, \freg, lsl #2 // each filter row is 4 bytes
>+        ld1            {v0.8b}, [\xreg]
>+        sxtl            v0.8h, v0.8b // sign-extend the 4 taps to s16 in v0.4h for calc_epelh/calc_epelh2
>+.endm
>+
>+.macro calc_epelh dst, src1, src2, src3, src4
>+        smull           \dst\().4s, \src1\().4h, v0.h[0] // 4-tap filter on s16 input, 32-bit accumulator
>+        smlal           \dst\().4s, \src2\().4h, v0.h[1]
>+        smlal           \dst\().4s, \src3\().4h, v0.h[2]
>+        smlal           \dst\().4s, \src4\().4h, v0.h[3]
>+        sqshrn          \dst\().4h, \dst\().4s, #6 // narrow back to s16 with saturation, >> 6
>+.endm
>+
>+.macro calc_epelh2 dst, tmp, src1, src2, src3, src4
>+        smull2          \tmp\().4s, \src1\().8h, v0.h[0] // same as calc_epelh, but on the high 4 lanes, into \tmp
>+        smlal2          \tmp\().4s, \src2\().8h, v0.h[1]
>+        smlal2          \tmp\().4s, \src3\().8h, v0.h[2]
>+        smlal2          \tmp\().4s, \src4\().8h, v0.h[3]
>+        sqshrn2         \dst\().8h, \tmp\().4s, #6 // narrow into the high half of \dst
>+.endm
>+
>+function ff_hevc_put_hevc_epel_h4_8_neon, export=1
>+        load_epel_filterb x4, x5 // x4 = subpel index; taps broadcast into v0-v3
>+        sub             x1, x1, #1 // step src back one pixel for the left tap
>+        mov             x10, #(MAX_PB_SIZE * 2) // dst row stride in bytes
>+1:      ld1            {v4.8b}, [x1], x2 // 8 source bytes cover the 7 pixels needed for 4 outputs
>+        ushr            v5.2d, v4.2d, #8 // v5/v6/v7 = source shifted right by 1/2/3 pixels
>+        ushr            v6.2d, v5.2d, #8
>+        ushr            v7.2d, v6.2d, #8
>+        movi            v16.8h, #0 // clear accumulator
>+        calc_epelb      v16, v4, v5, v6, v7
>+        st1            {v16.4h}, [x0], x10 // only the low 4 results are valid for width 4
>+        subs            x3, x3, #1   // height
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_epel_h6_8_neon, export=1
>+        load_epel_filterb x4, x5
>+        sub             x1,  x1, #1
>+        sub             x2,  x2, #8
>+        mov             x10, #(MAX_PB_SIZE * 2 - 8)
>+1:      ld1            {v24.8b},  [x1], #8
>+        ushr            v26.2d, v24.2d, #8
>+        ushr            v27.2d, v26.2d, #8

>+        ushr            v28.2d, v27.2d, #8
[MC] A dependency chain like this will cause a pipeline stall; how about using EXT, or loading with LD1 directly?


>+        movi            v16.8h,   #0
>+        ld1            {v28.b}[5], [x1], x2
>+        calc_epelb      v16, v24, v26, v27, v28
>+        st1            {v16.4h},   [x0], #8
>+        st1            {v16.s}[2], [x0], x10
>+        subs            x3, x3,   #1   // height
>+        b.ne            1b
>+        ret
>+endfunc
>+

...


>-- 
>2.30.1 (Apple Git-130)
>
>_______________________________________________
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Martin Storsjö April 30, 2021, 11:46 a.m. UTC | #2
On Wed, 28 Apr 2021, Josh Dekker wrote:

> From: Rafal Dabrowa <fatwildcat@gmail.com>
>

First a couple technical details:

The use of '.ifeqs "\op", "sshr"' needs to be changed into '.ifc \op, 
sshr', because gas-preprocessor doesn't implement '.ifeqs'.

The checkasm tests for hevc_pel that were added in 
9c513edb7999a35ddcc6e3a8d984a96c8fb492a3 aren't actually ever run when you 
run "make fate-checkasm", because they're not added in 
tests/fate/checkasm.mak. They're, IMO, separated way too finegrainedly 
into 10 test groups of hevc_epel, hevc_epel_uni, hevc_epel_uni_w, etc. I'd 
strongly suggest you'd merge them into one single group, hevc_pel, just 
like the file name. That makes it easier to hook up to fate.

(As a side note, the fact that new checkasm tests need to be hooked up in 
fate like this in tests/fate/checkasm.mak is easy to overlook, and the 
benefit from being able to mark one specific group of checkasm as 
known-failure to ignore, is not used very much. So maybe we should 
reconsider to just have one single "fate-checkasm" which runs the full 
suite?)

All the tests crash under checkasm on linux. The reason is that they use 
e.g. x3 for loop counter, while the variable is declared as int. Checkasm 
tests against this by making sure the upper half of such registers are 
filled with garbage, but the wrapper that does that can't be used on apple 
platforms, so that aspect goes untested when developing on a mac. So 
please test checkasm on something else than a mac too.

Then, there's the sheer size of the patch. You said you considered 
templating it - if possible that would be very welcome. I haven't looked 
closely enough to be able to say easily how to do it, if at all though.

Then finally, the code is very very stall prone. It might not be 
measurable on an M1, but is very real on other cores. While you can say "I 
haven't bothered tuning it yet", the thing is, you don't need to actually 
run it on such a core to roughly do much better scheduling. I did one 
initial test on one function and got it sped up by 4% by just adjusting it 
in an initial way. Example:

function ff_hevc_put_hevc_epel_h48_8_neon, export=1
[...]
1:      ld3            {v26.16b, v27.16b, v28.16b}, [x1], x5
         ushr            v29.2d, v26.2d, #8    // Uses v26 which was loaded right before
         ushr            v30.2d, v27.2d, #8
         ushr            v31.2d, v28.2d, #8
         ld1            {v24.s}[0], [x1], x5
         ld1            {v25.s}[0], [x1], x2 // Uses x1 which was updated in the previous instruction
         mov             v29.b[7], v24.b[0] // Uses v24 which was loaded almost right before
         mov             v30.b[7], v24.b[1]
         mov             v31.b[7], v24.b[2]
         mov             v29.b[15], v25.b[0]
         mov             v30.b[15], v25.b[1]
         mov             v31.b[15], v25.b[2]
         movi            v16.8h, #0 // This has no dependencies and could be done anytime, while e.g. waiting for results for the loads above!
         movi            v17.8h, #0
         movi            v18.8h, #0
         movi            v20.8h, #0
         movi            v21.8h, #0
         movi            v22.8h, #0
         calc_epelb      v16, v26, v27, v28, v29
         calc_epelb2     v20, v26, v27, v28, v29
         calc_epelb      v17, v27, v28, v29, v30
         calc_epelb2     v21, v27, v28, v29, v30
         calc_epelb      v18, v28, v29, v30, v31
         calc_epelb2     v22, v28, v29, v30, v31
         st3            {v16.8h, v17.8h, v18.8h}, [x0], #48
         st3            {v20.8h, v21.8h, v22.8h}, [x0], x10
         subs            x3, x3, #1   // This updates the condition flags
                                      // right before the branch, while we could stick it anywhere
                                      // else in the loop, where we have an instruction waiting for
                                      // the result of the previous one
         b.ne            1b
         ret


A trivial rescheduling of it looks like this:

1:      ld3            {v26.16b, v27.16b, v28.16b}, [x1], x5
         movi            v16.8h, #0
         movi            v17.8h, #0
         movi            v18.8h, #0
         ld1            {v24.s}[0], [x1], x5
         movi            v20.8h, #0
         movi            v21.8h, #0
         movi            v22.8h, #0
         ld1            {v25.s}[0], [x1], x2
         ushr            v29.2d, v26.2d, #8
         ushr            v30.2d, v27.2d, #8
         ushr            v31.2d, v28.2d, #8
         mov             v29.b[7], v24.b[0]
         mov             v30.b[7], v24.b[1]
         mov             v31.b[7], v24.b[2]
         mov             v29.b[15], v25.b[0]
         mov             v30.b[15], v25.b[1]
         mov             v31.b[15], v25.b[2]
         calc_epelb      v16, v26, v27, v28, v29
         calc_epelb2     v20, v26, v27, v28, v29
         calc_epelb      v17, v27, v28, v29, v30
         calc_epelb2     v21, v27, v28, v29, v30
         calc_epelb      v18, v28, v29, v30, v31
         calc_epelb2     v22, v28, v29, v30, v31
         st3            {v16.8h, v17.8h, v18.8h}, [x0], #48
         subs            x3, x3, #1   // height
         st3            {v20.8h, v21.8h, v22.8h}, [x0], x10
         b.ne            1b

The first version gave these checkasm numbers for me:

                         Cortex A53      A72      A73
put_hevc_epel_h48_8_neon:   3312.7   2545.7   2961.5
The new version was:
put_hevc_epel_h48_8_neon:   3168.7   2497.0   2842.5

That is a 4% speedup on A53, 1.8% speedup on A72 and 4% speedup on A73. 
(The latter two were quite noisy though so the exact benefit is unknown.)

One doesn't necessarily need to run and test and tune all of it for the 
exact perfect scheduling, but the functions look trivial enough that you 
can improve it a lot like this by just shuffling things around.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..ebedc03bfa 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -61,6 +61,8 @@  NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
                                            aarch64/vp9lpf_neon.o               \
                                            aarch64/vp9mc_16bpp_neon.o          \
                                            aarch64/vp9mc_neon.o
-NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
+NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_epel_neon.o         \
+                                           aarch64/hevcdsp_idct_neon.o         \
                                            aarch64/hevcdsp_init_aarch64.o      \
+                                           aarch64/hevcdsp_qpel_neon.o         \
                                            aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
new file mode 100644
index 0000000000..0366fe8ae3
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -0,0 +1,3931 @@ 
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2) // dst row stride in bytes
+1:      ld1            {v0.s}[0], [x1], x2 // x0 = dst (i16), x1 = src (u8), x2 = srcstride, x3 = height
+        ushll           v4.8h, v0.8b, #6 // widen u8 and scale by 64 (HEVC intermediate precision)
+        st1            {v4.d}[0], [x0], x7
+        subs            x3, x3, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2 - 8) // dst stride minus the 8 bytes stored below
+1:      ld1            {v0.8b}, [x1], x2 // width 6 handled as 4 + 2 halfword stores
+        ushll           v4.8h, v0.8b, #6 // widen u8 and scale by 64
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x7
+        subs            x3, x3, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2) // dst row stride in bytes
+1:      ld1            {v0.8b}, [x1], x2 // x1 = src (u8), x2 = srcstride, x3 = height
+        ushll           v4.8h, v0.8b, #6 // widen u8 and scale by 64
+        st1            {v4.8h}, [x0], x7
+        subs            x3, x3, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2 - 16) // dst stride minus the 16 bytes stored below
+1:      ld1            {v0.8b, v1.8b}, [x1], x2 // width 12 handled as 8 + 4 pixels
+        ushll           v4.8h, v0.8b, #6 // widen u8 and scale by 64
+        st1            {v4.8h}, [x0], #16
+        ushll           v5.8h, v1.8b, #6
+        st1            {v5.d}[0], [x0], x7
+        subs            x3, x3, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2) // dst row stride in bytes
+1:      ld1            {v0.8b, v1.8b}, [x1], x2 // x1 = src (u8), x2 = srcstride, x3 = height
+        ushll           v4.8h, v0.8b, #6 // widen u8 and scale by 64
+        ushll           v5.8h, v1.8b, #6
+        st1            {v4.8h, v5.8h}, [x0], x7
+        subs            x3, x3, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2) // dst row stride in bytes
+1:      ld1            {v0.8b, v1.8b, v2.8b}, [x1], x2 // x1 = src (u8), x2 = srcstride, x3 = height
+        ushll           v4.8h, v0.8b, #6 // widen u8 and scale by 64
+        ushll           v5.8h, v1.8b, #6
+        ushll           v6.8h, v2.8b, #6
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x7
+        subs            x3, x3, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2) // dst row stride in bytes
+1:      ld1            {v0.8b, v1.8b, v2.8b, v3.8b}, [x1], x2 // x1 = src (u8), x2 = srcstride, x3 = height
+        ushll           v4.8h, v0.8b, #6 // widen u8 and scale by 64
+        ushll           v5.8h, v1.8b, #6
+        ushll           v6.8h, v2.8b, #6
+        ushll           v7.8h, v3.8b, #6
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x7
+        subs            x3, x3, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE) // remaining advance: 64 (below) + 64 = full dst stride of 2*MAX_PB_SIZE bytes
+1:      ld1            {v0.16b, v1.16b, v2.16b}, [x1], x2 // x1 = src (u8), x2 = srcstride, x3 = height
+        ushll           v4.8h, v0.8b, #6 // widen u8 and scale by 64
+        ushll2          v5.8h, v0.16b, #6
+        ushll           v6.8h, v1.8b, #6
+        ushll2          v7.8h, v1.16b, #6
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 // first 32 halfwords
+        ushll           v4.8h, v2.8b, #6
+        ushll2          v5.8h, v2.16b, #6
+        st1            {v4.8h, v5.8h}, [x0], x7 // last 16 halfwords; x7 skips to next row
+        subs            x3, x3, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
+1:      ld1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2 // x1 = src (u8), x2 = srcstride, x3 = height
+        ushll           v4.8h, v0.8b, #6 // widen u8 and scale by 64
+        ushll2          v5.8h, v0.16b, #6
+        ushll           v6.8h, v1.8b, #6
+        ushll2          v7.8h, v1.16b, #6
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #(MAX_PB_SIZE) // first 32 halfwords
+        ushll           v4.8h, v2.8b, #6
+        ushll2          v5.8h, v2.16b, #6
+        ushll           v6.8h, v3.8b, #6
+        ushll2          v7.8h, v3.16b, #6
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #(MAX_PB_SIZE) // second 32 halfwords
+        subs            x3, x3, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2) // src2 row stride in bytes
+1:      ld1            {v0.s}[0], [x2], x3 // src (u8); x0 = dst, x1 = dststride, x4 = src2, x5 = height
+        ushll           v16.8h, v0.8b, #6 // src * 64
+        ld1            {v20.4h}, [x4], x10 // src2 (i16 intermediate)
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v0.8b,  v16.8h, #7 // clip_u8((sum + 64) >> 7)
+        st1            {v0.s}[0], [x0], x1
+        subs            x5, x5, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2 - 8) // src2 stride minus the 8 bytes consumed below
+        sub             x1, x1, #4 // dststride compensation: first store post-increments by 4
+1:      ld1            {v0.8b}, [x2], x3 // src (u8); width 6 handled as 4 + 2 pixels
+        ushll           v16.8h, v0.8b, #6 // src * 64
+        ld1            {v20.4h}, [x4], #8
+        ld1            {v20.s}[2], [x4], x10
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v0.8b,  v16.8h, #7 // clip_u8((sum + 64) >> 7)
+        st1            {v0.s}[0], [x0], #4
+        st1            {v0.h}[2], [x0], x1
+        subs            x5, x5, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2) // src2 row stride in bytes
+1:      ld1            {v0.8b}, [x2], x3     // src (u8); x0 = dst, x1 = dststride, x5 = height
+        ushll           v16.8h, v0.8b, #6 // src * 64
+        ld1            {v20.8h}, [x4], x10   // src2 (i16 intermediate)
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v0.8b,  v16.8h, #7 // clip_u8((sum + 64) >> 7)
+        st1            {v0.8b}, [x0], x1
+        subs            x5, x5, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2 - 16) // src2 stride minus the 16 bytes consumed below
+        sub             x1, x1, #8 // dststride compensation: first store post-increments by 8
+1:      ld1            {v0.16b}, [x2], x3 // src (u8); width 12 handled as 8 + 4 pixels
+        ushll           v16.8h, v0.8b, #6 // src * 64
+        ushll2          v17.8h, v0.16b, #6
+        ld1            {v20.8h}, [x4], #16
+        ld1            {v21.4h}, [x4], x10
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqadd           v17.8h, v17.8h, v21.8h
+        sqrshrun        v0.8b,  v16.8h, #7 // clip_u8((sum + 64) >> 7)
+        sqrshrun2       v0.16b, v17.8h, #7
+        st1            {v0.8b}, [x0], #8
+        st1            {v0.s}[2], [x0], x1
+        subs            x5, x5, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2) // src2 row stride in bytes
+1:      ld1            {v0.16b}, [x2], x3            // src (u8); x0 = dst, x1 = dststride, x5 = height
+        ushll           v16.8h, v0.8b, #6 // src * 64
+        ushll2          v17.8h, v0.16b, #6
+        ld1            {v20.8h, v21.8h}, [x4], x10   // src2 (i16 intermediate)
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqadd           v17.8h, v17.8h, v21.8h
+        sqrshrun        v0.8b,  v16.8h, #7 // clip_u8((sum + 64) >> 7)
+        sqrshrun2       v0.16b, v17.8h, #7
+        st1            {v0.16b}, [x0], x1
+        subs            x5, x5, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2) // src2 row stride in bytes
+1:      ld1            {v0.8b, v1.8b, v2.8b}, [x2], x3  // src (u8); x0 = dst, x1 = dststride, x5 = height
+        ushll           v16.8h, v0.8b, #6 // src * 64
+        ushll           v17.8h, v1.8b, #6
+        ushll           v18.8h, v2.8b, #6
+        ld1            {v20.8h, v21.8h, v22.8h}, [x4], x10   // src2 (i16 intermediate)
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqadd           v17.8h, v17.8h, v21.8h
+        sqadd           v18.8h, v18.8h, v22.8h
+        sqrshrun        v0.8b, v16.8h, #7 // clip_u8((sum + 64) >> 7)
+        sqrshrun        v1.8b, v17.8h, #7
+        sqrshrun        v2.8b, v18.8h, #7
+        st1            {v0.8b, v1.8b, v2.8b}, [x0], x1
+        subs            x5, x5, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2) // src2 row stride in bytes
+1:      ld1            {v0.16b, v1.16b}, [x2], x3            // src (u8); x0 = dst, x1 = dststride, x5 = height
+        ushll           v16.8h, v0.8b, #6 // src * 64
+        ushll2          v17.8h, v0.16b, #6
+        ushll           v18.8h, v1.8b, #6
+        ushll2          v19.8h, v1.16b, #6
+        ld1            {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], x10   // src2 (i16 intermediate)
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqadd           v17.8h, v17.8h, v21.8h
+        sqadd           v18.8h, v18.8h, v22.8h
+        sqadd           v19.8h, v19.8h, v23.8h
+        sqrshrun        v0.8b,  v16.8h, #7 // clip_u8((sum + 64) >> 7)
+        sqrshrun2       v0.16b, v17.8h, #7
+        sqrshrun        v1.8b,  v18.8h, #7
+        sqrshrun2       v1.16b, v19.8h, #7
+        st1            {v0.16b, v1.16b}, [x0], x1
+        subs            x5, x5, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE) // remaining advance: 64 (first ld1) + 64 = full src2 stride
+1:      ld1            {v0.16b, v1.16b, v2.16b}, [x2], x3            // src (u8); x0 = dst, x1 = dststride, x5 = height
+        ushll           v16.8h, v0.8b, #6 // src * 64
+        ushll2          v17.8h, v0.16b, #6
+        ushll           v18.8h, v1.8b, #6
+        ushll2          v19.8h, v1.16b, #6
+        ushll           v20.8h, v2.8b, #6
+        ushll2          v21.8h, v2.16b, #6
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)        // src2, first 32 halfwords
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqadd           v19.8h, v19.8h, v27.8h
+        ld1            {v24.8h, v25.8h}, [x4], x10 // src2, last 16 halfwords; x10 skips to next row
+        sqadd           v20.8h, v20.8h, v24.8h
+        sqadd           v21.8h, v21.8h, v25.8h
+        sqrshrun        v0.8b, v16.8h, #7 // clip_u8((sum + 64) >> 7)
+        sqrshrun2       v0.16b, v17.8h, #7
+        sqrshrun        v1.8b, v18.8h, #7
+        sqrshrun2       v1.16b, v19.8h, #7
+        sqrshrun        v2.8b, v20.8h, #7
+        sqrshrun2       v2.16b, v21.8h, #7
+        st1            {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs            x5, x5, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1
+1:      ld1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3            // src (u8); x0 = dst, x1 = dststride, x5 = height
+        ushll           v16.8h, v0.8b, #6 // src * 64
+        ushll2          v17.8h, v0.16b, #6
+        ushll           v18.8h, v1.8b, #6
+        ushll2          v19.8h, v1.16b, #6
+        ushll           v20.8h, v2.8b, #6
+        ushll2          v21.8h, v2.16b, #6
+        ushll           v22.8h, v3.8b, #6
+        ushll2          v23.8h, v3.16b, #6
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)        // src2, first 32 halfwords
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqadd           v19.8h, v19.8h, v27.8h
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) // src2, second 32 halfwords
+        sqadd           v20.8h, v20.8h, v24.8h
+        sqadd           v21.8h, v21.8h, v25.8h
+        sqadd           v22.8h, v22.8h, v26.8h
+        sqadd           v23.8h, v23.8h, v27.8h
+        sqrshrun        v0.8b, v16.8h, #7 // clip_u8((sum + 64) >> 7)
+        sqrshrun2       v0.16b, v17.8h, #7
+        sqrshrun        v1.8b, v18.8h, #7
+        sqrshrun2       v1.16b, v19.8h, #7
+        sqrshrun        v2.8b, v20.8h, #7
+        sqrshrun2       v2.16b, v21.8h, #7
+        sqrshrun        v3.8b, v22.8h, #7
+        sqrshrun2       v3.16b, v23.8h, #7
+        st1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs            x5, x5, #1 // height countdown
+        b.ne            1b
+        ret
+endfunc
+
+.Lepel_filters:        // 4-tap EPEL (chroma) filter coefficients, one 4-byte row per subpel index 0..7
+        .byte  0,  0,  0,  0 // index 0: full-pel, no filtering
+        .byte -2, 58, 10, -2
+        .byte -4, 54, 16, -2
+        .byte -6, 46, 28, -4
+        .byte -4, 36, 36, -4 // index 4: symmetric half-sample position
+        .byte -4, 28, 46, -6
+        .byte -2, 16, 54, -4
+        .byte -2, 10, 58, -2
+
+.macro load_epel_filterb freg, xreg
+        adr             \xreg, .Lepel_filters // \freg = subpel index, \xreg = scratch
+        add             \xreg, \xreg, \freg, lsl #2 // each filter row is 4 bytes
+        ld4r           {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg] // filter taps broadcast to all lanes of v0-v3
+        neg             v0.16b, v0.16b // negate taps 0 and 3 so calc_epelb can use unsigned umlsl/umlal
+        neg             v3.16b, v3.16b
+.endm
+
+.macro calc_epelb dst, src1, src2, src3, src4
+        umlsl           \dst\().8h, \src1\().8b, v0.8b // 4-tap filter on low 8 u8 lanes; v0-v3 from load_epel_filterb
+        umlal           \dst\().8h, \src2\().8b, v1.8b
+        umlal           \dst\().8h, \src3\().8b, v2.8b
+        umlsl           \dst\().8h, \src4\().8b, v3.8b // taps 0/3 were negated on load, so subtract here
+.endm
+
+.macro calc_epelb2 dst, src1, src2, src3, src4
+        umlsl2          \dst\().8h, \src1\().16b, v0.16b // same as calc_epelb, but on the high 8 lanes
+        umlal2          \dst\().8h, \src2\().16b, v1.16b
+        umlal2          \dst\().8h, \src3\().16b, v2.16b
+        umlsl2          \dst\().8h, \src4\().16b, v3.16b // taps 0/3 were negated on load, so subtract here
+.endm
+
+.macro load_epel_filterh freg, xreg
+        adr             \xreg, .Lepel_filters // \freg = subpel index, \xreg = scratch
+        add             \xreg, \xreg, \freg, lsl #2 // each filter row is 4 bytes
+        ld1            {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b // sign-extend the 4 taps to s16 in v0.4h for calc_epelh/calc_epelh2
+.endm
+
+.macro calc_epelh dst, src1, src2, src3, src4
+        smull           \dst\().4s, \src1\().4h, v0.h[0] // 4-tap filter on s16 input, 32-bit accumulator
+        smlal           \dst\().4s, \src2\().4h, v0.h[1]
+        smlal           \dst\().4s, \src3\().4h, v0.h[2]
+        smlal           \dst\().4s, \src4\().4h, v0.h[3]
+        sqshrn          \dst\().4h, \dst\().4s, #6 // narrow back to s16 with saturation, >> 6
+.endm
+
+.macro calc_epelh2 dst, tmp, src1, src2, src3, src4
+        smull2          \tmp\().4s, \src1\().8h, v0.h[0] // same as calc_epelh, but on the high 4 lanes, into \tmp
+        smlal2          \tmp\().4s, \src2\().8h, v0.h[1]
+        smlal2          \tmp\().4s, \src3\().8h, v0.h[2]
+        smlal2          \tmp\().4s, \src4\().8h, v0.h[3]
+        sqshrn2         \dst\().8h, \tmp\().4s, #6 // narrow into the high half of \dst
+.endm
+
+/* Horizontal EPEL, width 4, 8-bit source, int16 output.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x4 = filter index.
+ * NOTE(review): argument layout inferred from register usage; matches
+ * the put_hevc_epel_* prototype family — confirm against hevcdsp. */
+function ff_hevc_put_hevc_epel_h4_8_neon, export=1
+        load_epel_filterb x4, x5        // v0-v3 = taps, x5 is scratch
+        sub             x1, x1, #1      // filter window starts at src[-1]
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v4.8b}, [x1], x2
+        ushr            v5.2d, v4.2d, #8    // same row advanced by 1 pixel
+        ushr            v6.2d, v5.2d, #8    // ... by 2 pixels
+        ushr            v7.2d, v6.2d, #8    // ... by 3 pixels
+        movi            v16.8h, #0          // clear accumulator
+        calc_epelb      v16, v4, v5, v6, v7
+        st1            {v16.4h}, [x0], x10  // only lanes 0-3 are valid
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+/* Horizontal EPEL, width 6.  Needs 9 source bytes per row: 8 via ld1
+ * plus the 9th loaded straight into lane 5 of the most-shifted vector.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x4 = filter index. */
+function ff_hevc_put_hevc_epel_h6_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1,  x1, #1     // filter window starts at src[-1]
+        sub             x2,  x2, #8     // compensate the ld1 post-increment
+        mov             x10, #(MAX_PB_SIZE * 2 - 8)
+1:      ld1            {v24.8b},  [x1], #8
+        ushr            v26.2d, v24.2d, #8  // +1 pixel
+        ushr            v27.2d, v26.2d, #8  // +2 pixels
+        ushr            v28.2d, v27.2d, #8  // +3 pixels
+        movi            v16.8h,   #0
+        ld1            {v28.b}[5], [x1], x2 // 9th byte completes v28
+        calc_epelb      v16, v24, v26, v27, v28
+        st1            {v16.4h},   [x0], #8 // 6 results stored as 4 + 2
+        st1            {v16.s}[2], [x0], x10
+        subs            x3, x3,   #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+/* Horizontal EPEL, width 8.  ld2 de-interleaves even/odd pixels so the
+ * four filter phases come from byte shifts of two vectors; st2
+ * re-interleaves the two result phases.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x4 = filter index. */
+function ff_hevc_put_hevc_epel_h8_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1      // filter window starts at src[-1]
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v24.8b, v25.8b}, [x1], x2 // even / odd pixels
+        ushr            v26.2d, v24.2d, #8  // evens advanced by one pair
+        ushr            v27.2d, v25.2d, #8  // odds advanced by one pair
+        ushr            v28.2d, v26.2d, #8
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27 // even output phase
+        calc_epelb      v17, v25, v26, v27, v28 // odd output phase
+        st2            {v16.4h, v17.4h}, [x0], x10
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+/* Horizontal EPEL, width 12.  Same even/odd scheme as the h8 version,
+ * but 16 results are computed and the two phases are re-interleaved
+ * with zip1/zip2 so only the first 12 are stored (8 + 4).
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x4 = filter index. */
+function ff_hevc_put_hevc_epel_h12_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1      // filter window starts at src[-1]
+        mov             x10, #(MAX_PB_SIZE * 2 - 16)
+1:      ld2            {v24.8b, v25.8b}, [x1], x2 // even / odd pixels
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v25.2d, #8
+        ushr            v28.2d, v26.2d, #8
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        zip1            v18.8h, v16.8h, v17.8h  // back to pixel order 0-7
+        zip2            v19.8h, v16.8h, v17.8h  // pixel order 8-15
+        st1            {v18.8h},   [x0], #16
+        st1            {v19.d}[0], [x0], x10    // pixels 8-11 only
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+/* Horizontal EPEL, width 16.  Needs 19 source bytes per row: 16 via
+ * ld2 plus 3 extras (v20) patched into the vacated top lane of each
+ * shifted vector.  (mov vN.b[i], vM.b[j] is an alias of ins.)
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x4 = filter index. */
+function ff_hevc_put_hevc_epel_h16_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1,  #1     // filter window starts at src[-1]
+        sub             x2, x2, #16     // compensate the ld2 post-increment
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v24.8b, v25.8b}, [x1], #16 // even / odd pixels
+        ld1            {v20.s}[0], [x1], x2        // bytes 16..18
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v25.2d, #8
+        mov             v26.b[7], v20.b[0]  // refill lane shifted out
+        mov             v27.b[7], v20.b[1]
+        ushr            v28.2d, v26.2d, #8
+        mov             v28.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        st2            {v16.8h, v17.8h}, [x0], x10 // re-interleave phases
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+/* Horizontal EPEL, width 24.  ld3 splits pixels into three interleaved
+ * phases (stride-3); 3 extra bytes (v20) refill the shifted-out lanes.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x4 = filter index. */
+function ff_hevc_put_hevc_epel_h24_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1      // filter window starts at src[-1]
+        sub             x2, x2, #24     // compensate the ld3 post-increment
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld3            {v24.8b, v25.8b, v26.8b}, [x1], #24
+        ld1            {v20.s}[0], [x1], x2    // bytes 24..26
+        ushr            v27.2d, v24.2d, #8
+        ushr            v28.2d, v25.2d, #8
+        ushr            v29.2d, v26.2d, #8
+        mov             v27.b[7], v20.b[0]  // refill shifted-out lanes
+        mov             v28.b[7], v20.b[1]
+        mov             v29.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        st3            {v16.8h, v17.8h, v18.8h}, [x0], x10 // re-interleave
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+/* Horizontal EPEL, width 32.  ld4 splits pixels into four interleaved
+ * phases (stride-4); 3 extra bytes (v20) refill the shifted-out lanes.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x4 = filter index. */
+function ff_hevc_put_hevc_epel_h32_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1      // filter window starts at src[-1]
+        sub             x2, x2, #32     // compensate the ld4 post-increment
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld4            {v24.8b, v25.8b, v26.8b, v27.8b}, [x1], #32
+        ld1            {v20.s}[0], [x1], x2    // bytes 32..34
+        ushr            v28.2d, v24.2d, #8
+        ushr            v29.2d, v25.2d, #8
+        ushr            v30.2d, v26.2d, #8
+        ins             v28.b[7], v20.b[0]  // refill shifted-out lanes
+        ins             v29.b[7], v20.b[1]
+        ins             v30.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v19.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        calc_epelb      v19, v27, v28, v29, v30
+        st4            {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+/* Horizontal EPEL, width 48.  ld3 of 16-byte vectors covers 48 pixels
+ * in three interleaved phases; two 4-byte tail loads (v24/v25) refill
+ * lanes 7 and 15 of each shifted vector.  Low/high byte halves are
+ * filtered with calc_epelb / calc_epelb2 respectively.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x4 = filter index.
+ * Fix vs. original: dst-stride remainder written symbolically as
+ * MAX_PB_SIZE * 2 - 48 (same value as the hard-coded 128 - 48) for
+ * consistency with the sibling functions. */
+function ff_hevc_put_hevc_epel_h48_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1      // filter window starts at src[-1]
+        sub             x2, x2, #48     // compensate the post-increments
+        mov             x5, #24
+        mov             x10, #(MAX_PB_SIZE * 2 - 48)
+1:      ld3            {v26.16b, v27.16b, v28.16b}, [x1], x5
+        ushr            v29.2d, v26.2d, #8
+        ushr            v30.2d, v27.2d, #8
+        ushr            v31.2d, v28.2d, #8
+        ld1            {v24.s}[0], [x1], x5    // bytes feeding lanes [7]
+        ld1            {v25.s}[0], [x1], x2    // bytes feeding lanes [15]
+        mov             v29.b[7], v24.b[0]
+        mov             v30.b[7], v24.b[1]
+        mov             v31.b[7], v24.b[2]
+        mov             v29.b[15], v25.b[0]
+        mov             v30.b[15], v25.b[1]
+        mov             v31.b[15], v25.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        calc_epelb      v16, v26, v27, v28, v29
+        calc_epelb2     v20, v26, v27, v28, v29
+        calc_epelb      v17, v27, v28, v29, v30
+        calc_epelb2     v21, v27, v28, v29, v30
+        calc_epelb      v18, v28, v29, v30, v31
+        calc_epelb2     v22, v28, v29, v30, v31
+        st3            {v16.8h, v17.8h, v18.8h}, [x0], #48
+        st3            {v20.8h, v21.8h, v22.8h}, [x0], x10
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+/* Horizontal EPEL, width 64.  ld4 of 16-byte vectors covers 64 pixels
+ * in four interleaved phases; two 4-byte tail loads (v4/v5) refill
+ * lanes 7 and 15 of each shifted vector.  Output stride is exactly
+ * 2*64 bytes, so both st4 use plain #64 post-increments.
+ * x0 = dst, x1 = src, x2 = src stride, x3 = height,
+ * x4 = filter index. */
+function ff_hevc_put_hevc_epel_h64_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1      // filter window starts at src[-1]
+        sub             x2, x2, #64     // compensate the post-increments
+        mov             x7, #32
+1:      ld4            {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x7
+        ushr            v28.2d, v24.2d, #8
+        ushr            v29.2d, v25.2d, #8
+        ushr            v30.2d, v26.2d, #8
+        ld1            {v4.s}[0], [x1], x7     // bytes feeding lanes [7]
+        ld1            {v5.s}[0], [x1], x2     // bytes feeding lanes [15]
+        ins             v28.b[7],  v4.b[0]
+        ins             v28.b[15], v5.b[0]
+        ins             v29.b[7],  v4.b[1]
+        ins             v29.b[15], v5.b[1]
+        ins             v30.b[7],  v4.b[2]
+        ins             v30.b[15], v5.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v19.8h, #0
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb2     v20, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb2     v21, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        calc_epelb2     v22, v26, v27, v28, v29
+        calc_epelb      v19, v27, v28, v29, v30
+        calc_epelb2     v23, v27, v28, v29, v30
+        st4            {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
+        st4            {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+/* Vertical EPEL, width 4.  Keeps a sliding window of 4 source rows in
+ * v16-v19; the loop is unrolled 4x so the oldest register is simply
+ * reloaded each step instead of moving data (register rotation).
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x5 = filter index (vertical). */
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+        load_epel_filterb x5, x4        // x4 is scratch here
+        sub             x1, x1, x2      // filter window starts at row -1
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.s}[0], [x1], x2    // prime rows -1, 0, 1
+        ld1            {v17.s}[0], [x1], x2
+        ld1            {v18.s}[0], [x1], x2
+1:      ld1            {v19.s}[0], [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.s}[0], [x1], x2    // rotate: v16 is oldest
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+/* Vertical EPEL, width 6.  Same 4x-unrolled register rotation as the
+ * v4 version; 6 results are stored as 8 + 4 bytes per row.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x5 = filter index (vertical). */
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2      // filter window starts at row -1
+        mov             x10, #(MAX_PB_SIZE * 2 - 8)
+        ld1            {v16.8b}, [x1], x2      // prime rows -1, 0, 1
+        ld1            {v17.8b}, [x1], x2
+        ld1            {v18.8b}, [x1], x2
+1:      ld1            {v19.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x1], x2      // rotate: v16 is oldest
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+/* Vertical EPEL, width 8.  4x-unrolled sliding window over rows in
+ * v16-v19 (register rotation, no data moves).
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x5 = filter index (vertical). */
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2      // filter window starts at row -1
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b}, [x1], x2      // prime rows -1, 0, 1
+        ld1            {v17.8b}, [x1], x2
+        ld1            {v18.8b}, [x1], x2
+1:      ld1            {v19.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x1], x2      // rotate: v16 is oldest
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+/* Vertical EPEL, width 12.  16 pixels per row are loaded and filtered
+ * (low half via calc_epelb, high half via calc_epelb2); only 12
+ * results are stored (16 + 8 bytes).  4x-unrolled register rotation.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x5 = filter index (vertical). */
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2      // filter window starts at row -1
+        mov             x10, #(MAX_PB_SIZE * 2 - 16)
+        ld1            {v16.16b}, [x1], x2     // prime rows -1, 0, 1
+        ld1            {v17.16b}, [x1], x2
+        ld1            {v18.16b}, [x1], x2
+1:      ld1            {v19.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        calc_epelb2     v5, v16, v17, v18, v19
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.d}[0], [x0], x10    // pixels 8-11 only
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x1], x2     // rotate: v16 is oldest
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.d}[0], [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.d}[0], [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.d}[0], [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+/* Vertical EPEL, width 16.  Low/high byte halves filtered with
+ * calc_epelb / calc_epelb2; 4x-unrolled register rotation over rows.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x5 = filter index (vertical). */
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2      // filter window starts at row -1
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b}, [x1], x2     // prime rows -1, 0, 1
+        ld1            {v17.16b}, [x1], x2
+        ld1            {v18.16b}, [x1], x2
+1:      ld1            {v19.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        calc_epelb2     v5, v16, v17, v18, v19
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x1], x2     // rotate: v16 is oldest
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+/* Vertical EPEL, width 24.  Each source row occupies three 8-byte
+ * registers; the sliding window therefore rotates register *triples*
+ * (v16-18 / v19-21 / v22-24 / v25-27) across the 4x-unrolled loop.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x5 = filter index (vertical). */
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2      // filter window starts at row -1
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b, v17.8b, v18.8b}, [x1], x2  // rows -1, 0, 1
+        ld1            {v19.8b, v20.8b, v21.8b}, [x1], x2
+        ld1            {v22.8b, v23.8b, v24.8b}, [x1], x2
+1:      ld1            {v25.8b, v26.8b, v27.8b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v16, v19, v22, v25
+        calc_epelb      v5, v17, v20, v23, v26
+        calc_epelb      v6, v18, v21, v24, v27
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8b, v17.8b, v18.8b}, [x1], x2  // rotate triple
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v19, v22, v25, v16
+        calc_epelb      v5, v20, v23, v26, v17
+        calc_epelb      v6, v21, v24, v27, v18
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8b, v20.8b, v21.8b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v22, v25, v16, v19
+        calc_epelb      v5, v23, v26, v17, v20
+        calc_epelb      v6, v24, v27, v18, v21
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8b, v23.8b, v24.8b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v25, v16, v19, v22
+        calc_epelb      v5, v26, v17, v20, v23
+        calc_epelb      v6, v27, v18, v21, v24
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+/* Vertical EPEL, width 32.  Each row is a pair of 16-byte registers;
+ * the window rotates register *pairs* across the 4x-unrolled loop.
+ * Low/high byte halves use calc_epelb / calc_epelb2.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x5 = filter index (vertical). */
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2      // filter window starts at row -1
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b, v17.16b}, [x1], x2    // rows -1, 0, 1
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        ld1            {v20.16b, v21.16b}, [x1], x2
+1:      ld1            {v22.16b, v23.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v16, v18, v20, v22
+        calc_epelb2     v5, v16, v18, v20, v22
+        calc_epelb      v6, v17, v19, v21, v23
+        calc_epelb2     v7, v17, v19, v21, v23
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b}, [x1], x2    // rotate pair
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v18, v20, v22, v16
+        calc_epelb2     v5, v18, v20, v22, v16
+        calc_epelb      v6, v19, v21, v23, v17
+        calc_epelb2     v7, v19, v21, v23, v17
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v20, v22, v16, v18
+        calc_epelb2     v5, v20, v22, v16, v18
+        calc_epelb      v6, v21, v23, v17, v19
+        calc_epelb2     v7, v21, v23, v17, v19
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v22, v16, v18, v20
+        calc_epelb2     v5, v22, v16, v18, v20
+        calc_epelb      v6, v23, v17, v19, v21
+        calc_epelb2     v7, v23, v17, v19, v21
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+/* Vertical EPEL, width 48.  Each row is a triple of 16-byte registers;
+ * the window rotates triples across the 4x-unrolled loop.  96 output
+ * bytes per row are stored as 64 + 32 with a stride remainder.
+ * x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = src stride,
+ * x3 = height, x5 = filter index (vertical).
+ * Fix vs. original: dst-stride remainder written symbolically as
+ * MAX_PB_SIZE * 2 - 64 (same value as the hard-coded 64) for
+ * consistency with the sibling functions. */
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2      // filter window starts at row -1
+        mov             x10, #(MAX_PB_SIZE * 2 - 64)
+        ld1            {v16.16b, v17.16b, v18.16b}, [x1], x2  // rows -1, 0, 1
+        ld1            {v19.16b, v20.16b, v21.16b}, [x1], x2
+        ld1            {v22.16b, v23.16b, v24.16b}, [x1], x2
+1:      ld1            {v25.16b, v26.16b, v27.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  v16, v19, v22, v25
+        calc_epelb2     v5,  v16, v19, v22, v25
+        calc_epelb      v6,  v17, v20, v23, v26
+        calc_epelb2     v7,  v17, v20, v23, v26
+        calc_epelb      v28, v18, v21, v24, v27
+        calc_epelb2     v29, v18, v21, v24, v27
+        st1            { v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v28.8h, v29.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b, v18.16b}, [x1], x2  // rotate triple
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  v19, v22, v25, v16
+        calc_epelb2     v5,  v19, v22, v25, v16
+        calc_epelb      v6,  v20, v23, v26, v17
+        calc_epelb2     v7,  v20, v23, v26, v17
+        calc_epelb      v28, v21, v24, v27, v18
+        calc_epelb2     v29, v21, v24, v27, v18
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v28.8h, v29.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.16b, v20.16b, v21.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  v22, v25, v16, v19
+        calc_epelb2     v5,  v22, v25, v16, v19
+        calc_epelb      v6,  v23, v26, v17, v20
+        calc_epelb2     v7,  v23, v26, v17, v20
+        calc_epelb      v28, v24, v27, v18, v21
+        calc_epelb2     v29, v24, v27, v18, v21
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v28.8h, v29.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.16b, v23.16b, v24.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  v25, v16, v19, v22
+        calc_epelb2     v5,  v25, v16, v19, v22
+        calc_epelb      v6,  v26, v17, v20, v23
+        calc_epelb2     v7,  v26, v17, v20, v23
+        calc_epelb      v28, v27, v18, v21, v24
+        calc_epelb2     v29, v27, v18, v21, v24
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v28.8h, v29.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+/* Vertical EPEL, width 64.  Each row is a quad of 16-byte registers;
+ * the window rotates quads across the 4x-unrolled loop.  Output stride
+ * is exactly 2*64 bytes, so both st1 use plain #64 post-increments.
+ * x0 = dst, x1 = src, x2 = src stride, x3 = height,
+ * x5 = filter index (vertical).
+ * Fix vs. original: this function uses v8-v11 as accumulators, whose
+ * low 64 bits (d8-d11) are callee-saved under AAPCS64 — they must be
+ * preserved across the call.  Save/restore added. */
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1
+        stp             d8, d9,  [sp, #-32]!    // AAPCS64: d8-d15 callee-saved
+        stp             d10, d11, [sp, #16]
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2      // filter window starts at row -1
+        ld1            {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+        ld1            {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+        ld1            {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+1:      ld1            {v28.16b, v29.16b, v30.16b, v31.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4,  v16, v20, v24, v28
+        calc_epelb2     v5,  v16, v20, v24, v28
+        calc_epelb      v6,  v17, v21, v25, v29
+        calc_epelb2     v7,  v17, v21, v25, v29
+        calc_epelb      v8,  v18, v22, v26, v30
+        calc_epelb2     v9,  v18, v22, v26, v30
+        calc_epelb      v10, v19, v23, v27, v31
+        calc_epelb2     v11, v19, v23, v27, v31
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2 // rotate quad
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4,  v20, v24, v28, v16
+        calc_epelb2     v5,  v20, v24, v28, v16
+        calc_epelb      v6,  v21, v25, v29, v17
+        calc_epelb2     v7,  v21, v25, v29, v17
+        calc_epelb      v8,  v22, v26, v30, v18
+        calc_epelb2     v9,  v22, v26, v30, v18
+        calc_epelb      v10, v23, v27, v31, v19
+        calc_epelb2     v11, v23, v27, v31, v19
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4,  v24, v28, v16, v20
+        calc_epelb2     v5,  v24, v28, v16, v20
+        calc_epelb      v6,  v25, v29, v17, v21
+        calc_epelb2     v7,  v25, v29, v17, v21
+        calc_epelb      v8,  v26, v30, v18, v22
+        calc_epelb2     v9,  v26, v30, v18, v22
+        calc_epelb      v10, v27, v31, v19, v23
+        calc_epelb2     v11, v27, v31, v19, v23
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4, v28, v16, v20, v24
+        calc_epelb2     v5, v28, v16, v20, v24
+        calc_epelb      v6, v29, v17, v21, v25
+        calc_epelb2     v7, v29, v17, v21, v25
+        calc_epelb      v8, v30, v18, v22, v26
+        calc_epelb2     v9, v30, v18, v22, v26
+        calc_epelb      v10, v31, v19, v23, v27
+        calc_epelb2     v11, v31, v19, v23, v27
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ldp             d10, d11, [sp, #16]     // restore callee-saved regs
+        ldp             d8, d9,  [sp], #32
+        ret
+endfunc
+
+/* 2-D (horizontal + vertical) EPEL, width 4.  First runs the h4
+ * function over height+3 rows into a stack scratch buffer
+ * ((height+3)*MAX_PB_SIZE*2 bytes), then applies the vertical filter
+ * on the int16 intermediates.  The vertical pass *consumes* the
+ * scratch buffer via sp post-increments: sp advances exactly
+ * (height+3)*128 bytes by the time it falls through to ret, so the
+ * stack is balanced.  x0 = dst, x1 = src, x2 = src stride,
+ * x3 = height, x4 = mx index, x5 = my index. */
+function ff_hevc_put_hevc_epel_hv4_8_neon, export=1
+        add             x10, x3, #3
+        lsl             x10, x10, #7            // (height+3) * MAX_PB_SIZE*2
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x3, [sp, #-16]!     // save dst, height
+        stp             x5, x30, [sp, #-16]!    // save my, lr across the bl
+        add             x0, sp, #32             // h-pass writes tmp_array
+        sub             x1, x1, x2              // start one row above
+        add             x3, x3, #3              // 3 extra rows for the taps
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16       // sp now = tmp_array
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.4h}, [sp], x10      // prime 3 rows, rotate below
+        ld1            {v17.4h}, [sp], x10
+        ld1            {v18.4h}, [sp], x10
+1:      ld1            {v19.4h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.4h}, [sp], x10      // rotate: v16 is oldest
+        calc_epelh      v4, v17, v18, v19, v16
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+/* 2-D EPEL, width 6.  h6 pass over height+3 rows into a stack scratch
+ * buffer, then a vertical halfword pass that consumes the buffer via
+ * sp post-increments (sp is balanced when the loop exhausts height).
+ * 6 results per row are stored as 8 + 4 bytes.
+ * x0 = dst, x1 = src, x2 = src stride, x3 = height, x4 = mx, x5 = my.
+ * Fix vs. original: dst-stride remainder written symbolically as
+ * MAX_PB_SIZE * 2 - 8 (same value as the hard-coded 120) for
+ * consistency with the sibling functions. */
+function ff_hevc_put_hevc_epel_hv6_8_neon, export=1
+        add             x10,  x3, #3
+        lsl             x10, x10, #7            // (height+3) * MAX_PB_SIZE*2
+        sub             sp, sp, x10     // tmp_array
+        stp             x0,  x3, [sp, #-16]!    // save dst, height
+        stp             x5, x30, [sp, #-16]!    // save my, lr across the bl
+        add             x0, sp, #32             // h-pass writes tmp_array
+        sub             x1, x1, x2              // start one row above
+        add             x3, x3, #3              // 3 extra rows for the taps
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0,  x3, [sp], #16      // sp now = tmp_array
+        load_epel_filterh x5, x4
+        mov             x5, #(MAX_PB_SIZE * 2 - 8)
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h}, [sp], x10      // prime 3 rows, rotate below
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+1:      ld1            {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x10      // rotate: v16 is oldest
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x5
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_hv8_8_neon: 2-D 4-tap EPEL interpolation, width 8.
+// Horizontal pass (h8) into a stack temp array, then a vertical pass over it.
+// x0 = int16_t *dst, x1 = src, x2 = srcstride, x3 = height; x4/x5 appear to
+// be the mx/my filter selectors -- NOTE(review): confirm against prototype.
+function ff_hevc_put_hevc_epel_hv8_8_neon, export=1
+        // tmp_array holds (height + 3) rows of MAX_PB_SIZE * 2 = 128 bytes
+        add             x10, x3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32             // x0 = tmp_array for the h pass
+        sub             x1, x1, x2              // start one source row above
+        add             x3, x3, #3              // need height + 3 filtered rows
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_epel_filterh x5, x4                // vertical filter selected by x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // prime rows 0-2; the post-incremented sp loads also free the tmp rows
+        ld1            {v16.8h}, [sp], x10
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+        // vertical loop, unrolled 4x with register rotation v16..v19
+1:      ld1            {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_hv12_8_neon: 2-D 4-tap EPEL interpolation, width 12.
+// Horizontal pass (h12) into a stack temp array, then a vertical pass that
+// processes 12 samples per row as an 8h + 4h pair.
+// x0 = int16_t *dst, x1 = src, x2 = srcstride, x3 = height; x4/x5 appear to
+// be the mx/my filter selectors -- NOTE(review): confirm against prototype.
+function ff_hevc_put_hevc_epel_hv12_8_neon, export=1
+        // tmp_array holds (height + 3) rows of MAX_PB_SIZE * 2 = 128 bytes
+        add             x10, x3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32             // x0 = tmp_array for the h pass
+        sub             x1, x1, x2              // start one source row above
+        add             x3, x3, #3              // need height + 3 filtered rows
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_epel_filterh x5, x4                // vertical filter selected by x5
+        // rows are written as 16 + 8 bytes; 112 completes the 128-byte stride
+        mov             x5, #112
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // prime rows 0-2; the post-incremented sp loads also free the tmp rows
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        // vertical loop, unrolled 4x with pair rotation v16/17..v22/23
+1:      ld1            {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.4h}, [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.4h}, [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.4h}, [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.4h}, [x0], x5
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_hv16_8_neon: 2-D 4-tap EPEL interpolation, width 16.
+// Horizontal pass (h16) into a stack temp array, then a vertical pass over
+// two 8h halves per row.  Also used as the column worker for hv32/hv64.
+// x0 = int16_t *dst, x1 = src, x2 = srcstride, x3 = height; x4/x5 appear to
+// be the mx/my filter selectors -- NOTE(review): confirm against prototype.
+function ff_hevc_put_hevc_epel_hv16_8_neon, export=1
+        // tmp_array holds (height + 3) rows of MAX_PB_SIZE * 2 = 128 bytes
+        add             x10, x3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32             // x0 = tmp_array for the h pass
+        sub             x1, x1, x2              // start one source row above
+        add             x3, x3, #3              // need height + 3 filtered rows
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_epel_filterh x5, x4                // vertical filter selected by x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // prime rows 0-2; the post-incremented sp loads also free the tmp rows
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        // vertical loop, unrolled 4x with pair rotation v16/17..v22/23
+1:      ld1            {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        calc_epelh2     v5, v6, v17, v19, v21, v23
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_hv24_8_neon: 2-D 4-tap EPEL interpolation, width 24.
+// Horizontal pass (h24) into a stack temp array, then a vertical pass over
+// three 8h groups per row.  Also used as the column worker for hv48.
+// x0 = int16_t *dst, x1 = src, x2 = srcstride, x3 = height; x4/x5 appear to
+// be the mx/my filter selectors -- NOTE(review): confirm against prototype.
+function ff_hevc_put_hevc_epel_hv24_8_neon, export=1
+        // tmp_array holds (height + 3) rows of MAX_PB_SIZE * 2 = 128 bytes
+        add             x10, x3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32             // x0 = tmp_array for the h pass
+        sub             x1, x1, x2              // start one source row above
+        add             x3, x3, #3              // need height + 3 filtered rows
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_epel_filterh x5, x4                // vertical filter selected by x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // prime rows 0-2; the post-incremented sp loads also free the tmp rows
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10
+        // vertical loop, unrolled 4x with triple rotation v16-18..v25-27
+1:      ld1            {v25.8h, v26.8h, v27.8h}, [sp], x10
+        calc_epelh      v4, v16, v19, v22, v25
+        calc_epelh2     v4, v5, v16, v19, v22, v25
+        calc_epelh      v5, v17, v20, v23, v26
+        calc_epelh2     v5, v6, v17, v20, v23, v26
+        calc_epelh      v6, v18, v21, v24, v27
+        calc_epelh2     v6, v7, v18, v21, v24, v27
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v22, v25, v16
+        calc_epelh2     v4, v5, v19, v22, v25, v16
+        calc_epelh      v5, v20, v23, v26, v17
+        calc_epelh2     v5, v6, v20, v23, v26, v17
+        calc_epelh      v6, v21, v24, v27, v18
+        calc_epelh2     v6, v7, v21, v24, v27, v18
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v25, v16, v19
+        calc_epelh2     v4, v5, v22, v25, v16, v19
+        calc_epelh      v5, v23, v26, v17, v20
+        calc_epelh2     v5, v6, v23, v26, v17, v20
+        calc_epelh      v6, v24, v27, v18, v21
+        calc_epelh2     v6, v7, v24, v27, v18, v21
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10
+        calc_epelh      v4, v25, v16, v19, v22
+        calc_epelh2     v4, v5, v25, v16, v19, v22
+        calc_epelh      v5, v26, v17, v20, v23
+        calc_epelh2     v5, v6, v26, v17, v20, v23
+        calc_epelh      v6, v27, v18, v21, v24
+        calc_epelh2     v6, v7, v27, v18, v21, v24
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_hv32_8_neon: width-32 2-D EPEL, implemented as two
+// width-16 calls (via x6) on the left and right halves of the block.
+function ff_hevc_put_hevc_epel_hv32_8_neon, export=1
+        str             x30, [sp, #-16]!        // save LR; keeps sp 16-aligned
+        stp             x4, x5, [sp, #-16]!     // filter selectors
+        stp             x2, x3, [sp, #-16]!     // srcstride / height
+        stp             x0, x1, [sp, #-16]!     // dst / src
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldp             x0, x1, [sp]            // reload the original arguments
+        ldp             x2, x3, [sp, #16]
+        ldp             x4, x5, [sp, #32]
+        add             sp, sp, #48             // drop the argument save area
+        add             x0, x0, #32             // dst is int16_t: 16 samples = 32 bytes
+        add             x1, x1, #16             // src: advance 16 pixels
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
+// ff_hevc_put_hevc_epel_hv48_8_neon: width-48 2-D EPEL, implemented as two
+// width-24 calls (via x6) on the left and right halves of the block.
+function ff_hevc_put_hevc_epel_hv48_8_neon, export=1
+        str             x30, [sp, #-16]!        // save LR; keeps sp 16-aligned
+        stp             x4, x5, [sp, #-16]!     // filter selectors
+        stp             x2, x3, [sp, #-16]!     // srcstride / height
+        stp             x0, x1, [sp, #-16]!     // dst / src
+        mov             x6, #24
+        bl              X(ff_hevc_put_hevc_epel_hv24_8_neon)
+        ldp             x0, x1, [sp]            // reload the original arguments
+        ldp             x2, x3, [sp, #16]
+        ldp             x4, x5, [sp, #32]
+        add             sp, sp, #48             // drop the argument save area
+        add             x0, x0, #48             // dst is int16_t: 24 samples = 48 bytes
+        add             x1, x1, #24             // src: advance 24 pixels
+        mov             x6, #24
+        bl              X(ff_hevc_put_hevc_epel_hv24_8_neon)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
+// ff_hevc_put_hevc_epel_hv64_8_neon: width-64 2-D EPEL as four width-16
+// column calls.  The saved argument block stays on the stack across the
+// first two calls (offset reloads) and is popped before the last call.
+function ff_hevc_put_hevc_epel_hv64_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!   // xzr slot pads to 16-byte alignment
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        mov             x6, #16                 // width for each column call
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldp             x4, x5, [sp]            // reload args, keep save area
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             x0, x0, #32             // column 2: dst is int16_t, +16 samples
+        add             x1, x1, #16
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             x0, x0, #64             // column 3
+        add             x1, x1, #32
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldp             x4, x5, [sp], #16       // final reload pops the save area
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #96             // column 4
+        add             x1, x1, #48
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldp             xzr, x30, [sp], #16     // discard padding, restore LR
+        ret
+endfunc
+
+// ff_hevc_put_hevc_epel_uni_v4_8_neon: vertical-only 4-tap EPEL, width 4,
+// uni prediction: result is rounded-shifted (#6) straight to 8-bit dst.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = height; x6
+// appears to select the vertical filter -- NOTE(review): confirm prototype.
+function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3              // start one source row above
+        // prime rows 0-2, then loop unrolled 4x with rotation v16..v19
+        ld1            {v16.s}[0], [x2], x3
+        ld1            {v17.s}[0], [x2], x3
+        ld1            {v18.s}[0], [x2], x3
+1:      ld1            {v19.s}[0], [x2], x3
+        movi            v4.8h, #0               // calc_epelb accumulates into v4
+        calc_epelb      v4, v16, v17, v18, v19
+        sqrshrun        v4.8b,  v4.8h, #6       // round, >>6, saturate to u8
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_uni_v6_8_neon: vertical-only 4-tap EPEL, width 6.
+// Each output row is stored as 4 + 2 bytes; dststride is pre-decremented by
+// 4 so the second store's post-increment lands on the next row.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = height; x6
+// appears to select the vertical filter -- NOTE(review): confirm prototype.
+function ff_hevc_put_hevc_epel_uni_v6_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3              // start one source row above
+        sub             x1, x1, #4              // compensate the 4-byte store below
+        // prime rows 0-2, then loop unrolled 4x with rotation v16..v19
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+1:      ld1            {v19.8b}, [x2], x3
+        movi            v4.8h, #0               // calc_epelb accumulates into v4
+        calc_epelb      v4, v16, v17, v18, v19
+        sqrshrun        v4.8b,  v4.8h, #6       // round, >>6, saturate to u8
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_uni_v8_8_neon: vertical-only 4-tap EPEL, width 8.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = height; x6
+// appears to select the vertical filter -- NOTE(review): confirm prototype.
+function ff_hevc_put_hevc_epel_uni_v8_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3              // start one source row above
+        // prime rows 0-2, then loop unrolled 4x with rotation v16..v19
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+1:      ld1            {v19.8b}, [x2], x3
+        movi            v4.8h, #0               // calc_epelb accumulates into v4
+        calc_epelb      v4, v16, v17, v18, v19
+        sqrshrun        v4.8b,  v4.8h, #6       // round, >>6, saturate to u8
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_uni_v12_8_neon: vertical-only 4-tap EPEL, width 12.
+// Filters 16 lanes (low half via calc_epelb, high half via calc_epelb2) but
+// stores only 12 bytes per row as 8 + 4; dststride is pre-decremented by 8.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = height; x6
+// appears to select the vertical filter -- NOTE(review): confirm prototype.
+function ff_hevc_put_hevc_epel_uni_v12_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3              // start one source row above
+        sub             x1, x1, #8              // compensate the 8-byte store below
+        // prime rows 0-2, then loop unrolled 4x with rotation v16..v19
+        ld1            {v16.16b}, [x2], x3
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+1:      ld1            {v19.16b}, [x2], x3
+        movi            v4.8h, #0               // accumulators for low/high halves
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        calc_epelb2     v5, v16, v17, v18, v19
+        sqrshrun        v4.8b,  v4.8h, #6       // round, >>6, saturate to u8
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_uni_v16_8_neon: vertical-only 4-tap EPEL, width 16.
+// Low/high byte halves are filtered by calc_epelb/calc_epelb2 and narrowed
+// back together with sqrshrun/sqrshrun2.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = height; x6
+// appears to select the vertical filter -- NOTE(review): confirm prototype.
+function ff_hevc_put_hevc_epel_uni_v16_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3              // start one source row above
+        // prime rows 0-2, then loop unrolled 4x with rotation v16..v19
+        ld1            {v16.16b}, [x2], x3
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+1:      ld1            {v19.16b}, [x2], x3
+        movi            v4.8h, #0               // accumulators for low/high halves
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        calc_epelb2     v5, v16, v17, v18, v19
+        sqrshrun        v4.8b,  v4.8h, #6       // round, >>6, saturate to u8
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_uni_v24_8_neon: vertical-only 4-tap EPEL, width 24,
+// processed as three 8-byte lanes per row (v16-18 / v19-21 / v22-24 rotate).
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = height; x6
+// appears to select the vertical filter -- NOTE(review): confirm prototype.
+function ff_hevc_put_hevc_epel_uni_v24_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3              // start one source row above
+        // prime rows 0-2, then loop unrolled 4x with triple rotation
+        ld1            {v16.8b, v17.8b, v18.8b}, [x2], x3
+        ld1            {v19.8b, v20.8b, v21.8b}, [x2], x3
+        ld1            {v22.8b, v23.8b, v24.8b}, [x2], x3
+1:      ld1            {v25.8b, v26.8b, v27.8b}, [x2], x3
+        movi            v4.8h, #0               // one accumulator per 8-byte lane
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v16, v19, v22, v25
+        calc_epelb      v5, v17, v20, v23, v26
+        calc_epelb      v6, v18, v21, v24, v27
+        sqrshrun        v4.8b,  v4.8h, #6       // round, >>6, saturate to u8
+        sqrshrun        v5.8b,  v5.8h, #6
+        sqrshrun        v6.8b,  v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.8b, v17.8b, v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v19, v22, v25, v16
+        calc_epelb      v5, v20, v23, v26, v17
+        calc_epelb      v6, v21, v24, v27, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun        v5.8b,  v5.8h, #6
+        sqrshrun        v6.8b,  v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8b, v20.8b, v21.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v22, v25, v16, v19
+        calc_epelb      v5, v23, v26, v17, v20
+        calc_epelb      v6, v24, v27, v18, v21
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun        v5.8b,  v5.8h, #6
+        sqrshrun        v6.8b,  v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8b, v23.8b, v24.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v25, v16, v19, v22
+        calc_epelb      v5, v26, v17, v20, v23
+        calc_epelb      v6, v27, v18, v21, v24
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun        v5.8b,  v5.8h, #6
+        sqrshrun        v6.8b,  v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_uni_v32_8_neon: vertical-only 4-tap EPEL, width 32,
+// processed as two 16-byte lanes (low/high byte halves each) per row.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = height; x6
+// appears to select the vertical filter -- NOTE(review): confirm prototype.
+function ff_hevc_put_hevc_epel_uni_v32_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3              // start one source row above
+        // prime rows 0-2, then loop unrolled 4x with pair rotation
+        ld1            {v16.16b, v17.16b}, [x2], x3
+        ld1            {v18.16b, v19.16b}, [x2], x3
+        ld1            {v20.16b, v21.16b}, [x2], x3
+1:      ld1            {v22.16b, v23.16b}, [x2], x3
+        movi            v4.8h, #0               // four 8h accumulators per row
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v16, v18, v20, v22
+        calc_epelb2     v5, v16, v18, v20, v22
+        calc_epelb      v6, v17, v19, v21, v23
+        calc_epelb2     v7, v17, v19, v21, v23
+        sqrshrun        v4.8b,  v4.8h, #6       // round, >>6, saturate to u8
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.16b, v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v18, v20, v22, v16
+        calc_epelb2     v5, v18, v20, v22, v16
+        calc_epelb      v6, v19, v21, v23, v17
+        calc_epelb2     v7, v19, v21, v23, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.16b, v19.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v20, v22, v16, v18
+        calc_epelb2     v5, v20, v22, v16, v18
+        calc_epelb      v6, v21, v23, v17, v19
+        calc_epelb2     v7, v21, v23, v17, v19
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v22, v16, v18, v20
+        calc_epelb2     v5, v22, v16, v18, v20
+        calc_epelb      v6, v23, v17, v19, v21
+        calc_epelb2     v7, v23, v17, v19, v21
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_uni_v48_8_neon: vertical-only 4-tap EPEL, width 48,
+// processed as three 16-byte lanes per row; v28/v29 serve as the third pair
+// of accumulators since v4-v7 are used by the first two lanes.
+// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = height; x6
+// appears to select the vertical filter -- NOTE(review): confirm prototype.
+function ff_hevc_put_hevc_epel_uni_v48_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3              // start one source row above
+        // prime rows 0-2, then loop unrolled 4x with triple rotation
+        ld1            {v16.16b, v17.16b, v18.16b}, [x2], x3
+        ld1            {v19.16b, v20.16b, v21.16b}, [x2], x3
+        ld1            {v22.16b, v23.16b, v24.16b}, [x2], x3
+1:      ld1            {v25.16b, v26.16b, v27.16b}, [x2], x3
+        movi            v4.8h, #0               // six 8h accumulators per row
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4, v16, v19, v22, v25
+        calc_epelb2     v5, v16, v19, v22, v25
+        calc_epelb      v6, v17, v20, v23, v26
+        calc_epelb2     v7, v17, v20, v23, v26
+        calc_epelb      v28, v18, v21, v24, v27
+        calc_epelb2     v29, v18, v21, v24, v27
+        sqrshrun        v4.8b,  v4.8h, #6       // round, >>6, saturate to u8
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v28.8h, #6
+        sqrshrun2       v6.16b,  v29.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f                      // height not a multiple of 4
+
+        ld1            {v16.16b, v17.16b, v18.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        // the v28/v29 lane is computed first here, unlike iteration 1 --
+        // presumably instruction scheduling; the results are the same
+        calc_epelb      v28, v21, v24, v27, v18
+        calc_epelb2     v29, v21, v24, v27, v18
+        calc_epelb      v4, v19, v22, v25, v16
+        calc_epelb2     v5, v19, v22, v25, v16
+        calc_epelb      v6, v20, v23, v26, v17
+        calc_epelb2     v7, v20, v23, v26, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v28.8h, #6
+        sqrshrun2       v6.16b,  v29.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.16b, v20.16b, v21.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v28, v24, v27, v18, v21
+        calc_epelb2     v29, v24, v27, v18, v21
+        calc_epelb      v4, v22, v25, v16, v19
+        calc_epelb2     v5, v22, v25, v16, v19
+        calc_epelb      v6, v23, v26, v17, v20
+        calc_epelb2     v7, v23, v26, v17, v20
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v28.8h, #6
+        sqrshrun2       v6.16b,  v29.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.16b, v23.16b, v24.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v28, v27, v18, v21, v24
+        calc_epelb2     v29, v27, v18, v21, v24
+        calc_epelb      v4, v25, v16, v19, v22
+        calc_epelb2     v5, v25, v16, v19, v22
+        calc_epelb      v6, v26, v17, v20, v23
+        calc_epelb2     v7, v26, v17, v20, v23
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v28.8h, #6
+        sqrshrun2       v6.16b,  v29.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v64_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1            {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        ld1            {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        ld1            {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+1:      ld1            {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v10, v19, v23, v27, v31
+        calc_epelb2     v11, v19, v23, v27, v31
+        calc_epelb      v4, v16, v20, v24, v28
+        calc_epelb2     v5, v16, v20, v24, v28
+        calc_epelb      v6, v17, v21, v25, v29
+        calc_epelb2     v7, v17, v21, v25, v29
+        calc_epelb      v8, v18, v22, v26, v30
+        calc_epelb2     v9, v18, v22, v26, v30
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v8.8h, #6
+        sqrshrun2       v6.16b,  v9.8h, #6
+        sqrshrun        v7.8b,  v10.8h, #6
+        sqrshrun2       v7.16b,  v11.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v10, v23, v27, v31, v19
+        calc_epelb2     v11, v23, v27, v31, v19
+        calc_epelb      v4, v20, v24, v28, v16
+        calc_epelb2     v5, v20, v24, v28, v16
+        calc_epelb      v6, v21, v25, v29, v17
+        calc_epelb2     v7, v21, v25, v29, v17
+        calc_epelb      v8, v22, v26, v30, v18
+        calc_epelb2     v9, v22, v26, v30, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v8.8h, #6
+        sqrshrun2       v6.16b,  v9.8h, #6
+        sqrshrun        v7.8b,  v10.8h, #6
+        sqrshrun2       v7.16b,  v11.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v10, v27, v31, v19, v23
+        calc_epelb2     v11, v27, v31, v19, v23
+        calc_epelb      v4, v24, v28, v16, v20
+        calc_epelb2     v5, v24, v28, v16, v20
+        calc_epelb      v6, v25, v29, v17, v21
+        calc_epelb2     v7, v25, v29, v17, v21
+        calc_epelb      v8, v26, v30, v18, v22
+        calc_epelb2     v9, v26, v30, v18, v22
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v8.8h, #6
+        sqrshrun2       v6.16b,  v9.8h, #6
+        sqrshrun        v7.8b,  v10.8h, #6
+        sqrshrun2       v7.16b,  v11.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v10, v31, v19, v23, v27
+        calc_epelb2     v11, v31, v19, v23, v27
+        calc_epelb      v4, v28, v16, v20, v24
+        calc_epelb2     v5, v28, v16, v20, v24
+        calc_epelb      v6, v29, v17, v21, v25
+        calc_epelb2     v7, v29, v17, v21, v25
+        calc_epelb      v8, v30, v18, v22, v26
+        calc_epelb2     v9, v30, v18, v22, v26
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v8.8h, #6
+        sqrshrun2       v6.16b,  v9.8h, #6
+        sqrshrun        v7.8b,  v10.8h, #6
+        sqrshrun2       v7.16b,  v11.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// epel uni hv, width 4, 8-bit: run the 4-tap horizontal pass into a stack
+// temporary (height + 3 rows of MAX_PB_SIZE * 2 bytes), then a 4-tap vertical
+// pass over the 16-bit intermediates, rounding-narrowed (#6) to 4 bytes/row.
+// Register use observed here: x0 dst, x1 dststride, x2 src, x3 srcstride,
+// x4 height, x5 mx, x6 my (presumably the hevcdsp uni_hv convention — confirm).
+function ff_hevc_put_hevc_epel_uni_hv4_8_neon, export=1
+        add             x10, x4, #3                  // rows needed by v-filter: height + 3
+        lsl             x10, x10, #7                 // x10 *= MAX_PB_SIZE * 2 (row stride)
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!          // preserve dst, dststride
+        stp             x4, x6, [sp, #-16]!          // preserve height, my
+        stp             xzr, x30, [sp, #-16]!        // preserve lr (xzr is padding)
+        add             x0, sp, #48                  // h-pass dst = tmp_array (above saves)
+        sub             x1, x2, x3                   // h-pass src = src - srcstride
+        mov             x2, x3                       // h-pass srcstride
+        add             x3, x4, #3                   // h-pass height = height + 3
+        mov             x4, x5                       // h-pass mx
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon)
+        ldp             xzr, x30, [sp], #16          // restore lr
+        ldp             x4, x6, [sp], #16            // restore height, my
+        ldp             x0, x1, [sp], #16            // restore dst, dststride
+        load_epel_filterh x6, x5                     // vertical coefficients from my
+        mov             x10, #(MAX_PB_SIZE * 2)      // tmp_array row stride in bytes
+        // The loads below post-increment sp through tmp_array; exactly
+        // height + 3 rows are consumed in total (3 here + 1 per output row),
+        // so sp ends up back at its original value — no explicit free needed.
+        ld1            {v16.4h}, [sp], x10
+        ld1            {v17.4h}, [sp], x10
+        ld1            {v18.4h}, [sp], x10
+        // 4x unrolled: v16-v19 rotate so each output row loads one new row.
+1:      ld1            {v19.4h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.4h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// epel uni hv, width 6, 8-bit: horizontal pass into a stack temporary
+// (height + 3 rows, MAX_PB_SIZE * 2 bytes each), then 4-tap vertical pass.
+// Rows are stored as 4 + 2 bytes; x1 is pre-reduced by 4 so the second store
+// advances dst by a full dststride.
+function ff_hevc_put_hevc_epel_uni_hv6_8_neon, export=1
+        add             x10, x4, #3                  // rows: height + 3
+        lsl             x10, x10, #7                 // * MAX_PB_SIZE * 2
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!          // preserve dst, dststride
+        stp             x4, x6, [sp, #-16]!          // preserve height, my
+        stp             xzr, x30, [sp, #-16]!        // preserve lr
+        add             x0, sp, #48                  // h-pass dst = tmp_array
+        sub             x1, x2, x3                   // h-pass src = src - srcstride
+        mov             x2, x3
+        add             x3, x4, #3                   // h-pass height = height + 3
+        mov             x4, x5                       // h-pass mx
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5                     // vertical coefficients from my
+        sub             x1, x1, #4                   // split 4+2 store below
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // sp walks through tmp_array; height + 3 loads pop the whole temp.
+        ld1            {v16.8h}, [sp], x10
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+        // 4x unrolled register rotation; calc_epelh/calc_epelh2 cover the
+        // low/high halves of the 8 lanes (6 of which are stored).
+1:      ld1            {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// epel uni hv, width 8, 8-bit: horizontal pass into a stack temporary
+// (height + 3 rows, MAX_PB_SIZE * 2 bytes each), then 4-tap vertical pass,
+// rounding-narrowed (#6) to 8 bytes per output row.
+function ff_hevc_put_hevc_epel_uni_hv8_8_neon, export=1
+        add             x10, x4, #3                  // rows: height + 3
+        lsl             x10, x10, #7                 // * MAX_PB_SIZE * 2
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!          // preserve dst, dststride
+        stp             x4, x6, [sp, #-16]!          // preserve height, my
+        stp             xzr, x30, [sp, #-16]!        // preserve lr
+        add             x0, sp, #48                  // h-pass dst = tmp_array
+        sub             x1, x2, x3                   // h-pass src = src - srcstride
+        mov             x2, x3
+        add             x3, x4, #3                   // h-pass height = height + 3
+        mov             x4, x5                       // h-pass mx
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5                     // vertical coefficients from my
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // sp walks through tmp_array; height + 3 loads pop the whole temp.
+        ld1            {v16.8h}, [sp], x10
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+        // 4x unrolled register rotation over v16-v19.
+1:      ld1            {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// epel uni hv, width 12, 8-bit: horizontal pass into a stack temporary, then
+// 4-tap vertical pass on register pairs (16 lanes, 12 stored as 8 + 4 bytes;
+// x1 is pre-reduced by 8 so the second store advances dst by dststride).
+function ff_hevc_put_hevc_epel_uni_hv12_8_neon, export=1
+        add             x10, x4, #3                  // rows: height + 3
+        lsl             x10, x10, #7                 // * MAX_PB_SIZE * 2
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!          // preserve dst, dststride
+        stp             x4, x6, [sp, #-16]!          // preserve height, my
+        stp             xzr, x30, [sp, #-16]!        // preserve lr
+        add             x0, sp, #48                  // h-pass dst = tmp_array
+        sub             x1, x2, x3                   // h-pass src = src - srcstride
+        mov             x2, x3
+        add             x3, x4, #3                   // h-pass height = height + 3
+        mov             x4, x5                       // h-pass mx
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5                     // vertical coefficients from my
+        sub             x1, x1, #8                   // split 8+4 store below
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // sp walks through tmp_array; height + 3 loads pop the whole temp.
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        // 4x unrolled rotation over the pairs v16/17, v18/19, v20/21, v22/23.
+1:      ld1            {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// epel uni hv, width 16, 8-bit: horizontal pass into a stack temporary, then
+// 4-tap vertical pass on register pairs, 16 bytes stored per row.
+// Also used as the column worker for widths 32 and 64 (see uni_hv32/uni_hv64).
+function ff_hevc_put_hevc_epel_uni_hv16_8_neon, export=1
+        add             x10, x4, #3                  // rows: height + 3
+        lsl             x10, x10, #7                 // * MAX_PB_SIZE * 2
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!          // preserve dst, dststride
+        stp             x4, x6, [sp, #-16]!          // preserve height, my
+        stp             xzr, x30, [sp, #-16]!        // preserve lr
+        add             x0, sp, #48                  // h-pass dst = tmp_array
+        sub             x1, x2, x3                   // h-pass src = src - srcstride
+        mov             x2, x3
+        add             x3, x4, #3                   // h-pass height = height + 3
+        mov             x4, x5                       // h-pass mx
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5                     // vertical coefficients from my
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // sp walks through tmp_array; height + 3 loads pop the whole temp.
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        // 4x unrolled rotation over the pairs v16/17, v18/19, v20/21, v22/23.
+1:      ld1            {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        calc_epelh2     v5, v6, v17, v19, v21, v23
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// epel uni hv, width 24, 8-bit: horizontal pass into a stack temporary, then
+// 4-tap vertical pass on register triples, 24 bytes stored per row.
+// Also used as the column worker for width 48 (see uni_hv48).
+function ff_hevc_put_hevc_epel_uni_hv24_8_neon, export=1
+        add             x10, x4, #3                  // rows: height + 3
+        lsl             x10, x10, #7                 // * MAX_PB_SIZE * 2
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!          // preserve dst, dststride
+        stp             x4, x6, [sp, #-16]!          // preserve height, my
+        stp             xzr, x30, [sp, #-16]!        // preserve lr
+        add             x0, sp, #48                  // h-pass dst = tmp_array
+        sub             x1, x2, x3                   // h-pass src = src - srcstride
+        mov             x2, x3
+        add             x3, x4, #3                   // h-pass height = height + 3
+        mov             x4, x5                       // h-pass mx
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5                     // vertical coefficients from my
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // sp walks through tmp_array; height + 3 loads pop the whole temp.
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10
+        // 4x unrolled rotation over the triples v16-18/v19-21/v22-24/v25-27.
+1:      ld1            {v25.8h, v26.8h, v27.8h}, [sp], x10
+        calc_epelh      v4, v16, v19, v22, v25
+        calc_epelh2     v4, v5, v16, v19, v22, v25
+        calc_epelh      v5, v17, v20, v23, v26
+        calc_epelh2     v5, v6, v17, v20, v23, v26
+        calc_epelh      v6, v18, v21, v24, v27
+        calc_epelh2     v6, v7, v18, v21, v24, v27
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun        v5.8b, v5.8h, #6
+        sqrshrun        v6.8b, v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v22, v25, v16
+        calc_epelh2     v4, v5, v19, v22, v25, v16
+        calc_epelh      v5, v20, v23, v26, v17
+        calc_epelh2     v5, v6, v20, v23, v26, v17
+        calc_epelh      v6, v21, v24, v27, v18
+        calc_epelh2     v6, v7, v21, v24, v27, v18
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun        v5.8b, v5.8h, #6
+        sqrshrun        v6.8b, v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v25, v16, v19
+        calc_epelh2     v4, v5, v22, v25, v16, v19
+        calc_epelh      v5, v23, v26, v17, v20
+        calc_epelh2     v5, v6, v23, v26, v17, v20
+        calc_epelh      v6, v24, v27, v18, v21
+        calc_epelh2     v6, v7, v24, v27, v18, v21
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun        v5.8b, v5.8h, #6
+        sqrshrun        v6.8b, v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10
+        calc_epelh      v4, v25, v16, v19, v22
+        calc_epelh2     v4, v5, v25, v16, v19, v22
+        calc_epelh      v5, v26, v17, v20, v23
+        calc_epelh2     v5, v6, v26, v17, v20, v23
+        calc_epelh      v6, v27, v18, v21, v24
+        calc_epelh2     v6, v7, v27, v18, v21, v24
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun        v5.8b, v5.8h, #6
+        sqrshrun        v6.8b, v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// epel uni hv, width 32: handled as two 16-wide columns by calling the
+// 16-wide routine twice (second call with dst/src advanced by 16, x7 = 16).
+function ff_hevc_put_hevc_epel_uni_hv32_8_neon, export=1
+        stp             x0, x30, [sp, #-16]!         // save dst, lr
+        stp             x1, x2, [sp, #-16]!          // save dststride, src
+        stp             x3, x4, [sp, #-16]!          // save srcstride, height
+        stp             x5, x6, [sp, #-16]!          // save mx, my
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             x5, x6, [sp], #16            // pop args back for 2nd column
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]                     // peek dst; {x0, lr} stays pushed
+        add             x0, x0, #16                  // right column: dst += 16
+        add             x2, x2, #16                  // right column: src += 16
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             xzr, x30, [sp], #16          // restore lr, discard saved x0
+        ret
+endfunc
+
+// epel uni hv, width 48: handled as two 24-wide columns by calling the
+// 24-wide routine twice (second call with dst/src advanced by 24, x7 = 24).
+function ff_hevc_put_hevc_epel_uni_hv48_8_neon, export=1
+        stp             x0, x30, [sp, #-16]!         // save dst, lr
+        stp             x1, x2, [sp, #-16]!          // save dststride, src
+        stp             x3, x4, [sp, #-16]!          // save srcstride, height
+        stp             x5, x6, [sp, #-16]!          // save mx, my
+        mov             x7, #24
+        bl              X(ff_hevc_put_hevc_epel_uni_hv24_8_neon)
+        ldp             x5, x6, [sp], #16            // pop args back for 2nd column
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]                     // peek dst; {x0, lr} stays pushed
+        add             x0, x0, #24                  // right column: dst += 24
+        add             x2, x2, #24                  // right column: src += 24
+        mov             x7, #24
+        bl              X(ff_hevc_put_hevc_epel_uni_hv24_8_neon)
+        ldp             xzr, x30, [sp], #16          // restore lr, discard saved x0
+        ret
+endfunc
+
+// epel uni hv, width 64: four 16-wide column passes via the 16-wide routine.
+// The saved-argument block is peeked (loads without writeback) for columns 2
+// and 3, then popped before the final column; {x0, lr} stays stacked until
+// the end.
+function ff_hevc_put_hevc_epel_uni_hv64_8_neon, export=1
+        stp             x0, x30, [sp, #-16]!         // save dst, lr       ([sp, #48])
+        stp             x1, x2, [sp, #-16]!          // save dststride, src ([sp, #32])
+        stp             x3, x4, [sp, #-16]!          // save srcstride, height ([sp, #16])
+        stp             x5, x6, [sp, #-16]!          // save mx, my        ([sp])
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             x5, x6, [sp]                 // peek all args (no pop)
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #16                  // column 2: offset 16
+        add             x2, x2, #16
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             x5, x6, [sp]                 // peek again for column 3
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #32                  // column 3: offset 32
+        add             x2, x2, #32
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             x5, x6, [sp], #16            // pop args for the last column
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]                     // peek dst; {x0, lr} still pushed
+        add             x0, x0, #48                  // column 4: offset 48
+        add             x2, x2, #48
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             xzr, x30, [sp], #16          // restore lr, discard saved x0
+        ret
+endfunc
+
+// epel bi, horizontal, width 4, 8-bit: 4-tap mx filter, then saturating add
+// of the int16 row from src2 (x4, row stride MAX_PB_SIZE * 2), rounding
+// narrow (#7) and store 4 bytes.
+// Register use observed: x0 dst, x1 dststride, x2 src, x3 srcstride,
+// x4 src2, x5 height, x6 mx (x7 is scratch for the filter load).
+function ff_hevc_put_hevc_epel_bi_h4_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1                   // filter window starts at x - 1
+        mov             x10, #(MAX_PB_SIZE * 2)      // src2 row stride
+1:      ld1            {v4.8b}, [x2], x3             // 8 source bytes (4 + 3 needed)
+        ushr            v5.2d, v4.2d, #8              // source shifted by 1 pixel
+        ushr            v6.2d, v5.2d, #8              // ... by 2 pixels
+        ushr            v7.2d, v6.2d, #8              // ... by 3 pixels
+        movi            v16.8h, #0
+        calc_epelb      v16, v4, v5, v6, v7
+        ld1            {v20.4h}, [x4], x10           // bi-prediction row
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+// epel bi, horizontal, width 6, 8-bit: needs 9 source pixels; 8 are loaded in
+// one go and the 9th is inserted into lane 5 of the 3-pixel-shifted copy.
+// Output stored as 4 + 2 bytes (x1 pre-reduced by 4, x3 pre-reduced by 8 to
+// compensate for the split src advances).
+function ff_hevc_put_hevc_epel_bi_h6_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x1, x1, #4                   // split 4+2 store below
+        sub             x2, x2, #1                   // filter window starts at x - 1
+        sub             x3, x3, #8                   // src advanced #8 + x3 per row
+        mov             x10, #(MAX_PB_SIZE * 2)      // src2 row stride
+1:      ld1            {v24.8b}, [x2], #8            // pixels 0..7
+        ushr            v26.2d, v24.2d, #8            // shifted by 1 pixel
+        ushr            v27.2d, v26.2d, #8            // shifted by 2 pixels
+        ushr            v28.2d, v27.2d, #8            // shifted by 3 pixels
+        movi            v16.8h, #0
+        ld1            {v28.b}[5], [x2], x3          // patch in pixel 8
+        calc_epelb      v16, v24, v26, v27, v28
+        ld1            {v20.8h}, [x4], x10           // bi-prediction row
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        st1            {v16.s}[0], [x0], #4
+        st1            {v16.h}[2], [x0], x1
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+// epel bi, horizontal, width 8, 8-bit: ld2 splits the source into even/odd
+// pixel streams, the filter is applied to each stream, and zip1 interleaves
+// the two result sets back into pixel order before the bi add and store.
+function ff_hevc_put_hevc_epel_bi_h8_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1                   // filter window starts at x - 1
+        mov             x10, #(MAX_PB_SIZE * 2)      // src2 row stride
+1:      ld2            {v24.8b, v25.8b}, [x2], x3    // even / odd pixels
+        ushr            v26.2d, v24.2d, #8            // evens shifted (pixels 2,4,...)
+        ushr            v27.2d, v25.2d, #8            // odds shifted (pixels 3,5,...)
+        ushr            v28.2d, v26.2d, #8
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27       // results for even outputs
+        calc_epelb      v17, v25, v26, v27, v28       // results for odd outputs
+        zip1            v16.8h, v16.8h, v17.8h        // back to linear pixel order
+        ld1            {v20.8h}, [x4], x10           // bi-prediction row
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        st1            {v16.8b}, [x0], x1
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+// epel bi, horizontal, width 12, 8-bit: even/odd split as in bi_h8, but both
+// zip halves are kept (16 results, 12 used); output stored as 8 + 4 bytes
+// (x1 pre-reduced by 8).
+function ff_hevc_put_hevc_epel_bi_h12_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x1, x1, #8                   // split 8+4 store below
+        sub             x2, x2, #1                   // filter window starts at x - 1
+        mov             x10, #(MAX_PB_SIZE * 2)      // src2 row stride
+1:      ld2            {v24.8b, v25.8b}, [x2], x3    // even / odd pixels
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v25.2d, #8
+        ushr            v28.2d, v26.2d, #8
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27       // even outputs
+        calc_epelb      v17, v25, v26, v27, v28       // odd outputs
+        zip1            v18.8h, v16.8h, v17.8h        // results 0..7 in pixel order
+        zip2            v19.8h, v16.8h, v17.8h        // results 8..15 (12..15 unused)
+        ld1            {v20.8h, v21.8h}, [x4], x10   // bi-prediction row
+        sqadd           v18.8h, v18.8h, v20.8h
+        sqadd           v19.8h, v19.8h, v21.8h
+        sqrshrun        v20.8b, v18.8h, #7
+        sqrshrun        v21.8b, v19.8h, #7
+        st1            {v20.8b}, [x0], #8
+        st1            {v21.s}[0], [x0], x1
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+// epel bi, horizontal, width 16, 8-bit: even/odd split via ld2; the bi add
+// and st2 work directly on the deinterleaved result streams (the matching
+// ld2 on src2 deinterleaves the int16 row the same way, and st2 restores
+// pixel order on store), so no zip reordering is needed.
+// Fix vs. original: dropped the dead "zip1 v18 / zip2 v19" pair — their
+// results were never read (leftover from the linear-store scheme of bi_h12).
+function ff_hevc_put_hevc_epel_bi_h16_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1                   // filter window starts at x - 1
+        sub             x3, x3, #16                  // src advanced #16 + x3 per row
+        mov             x10, #(MAX_PB_SIZE * 2)      // src2 row stride
+1:      ld2            {v24.8b, v25.8b}, [x2], #16   // even / odd pixels 0..15
+        ld1            {v20.s}[0], [x2], x3          // pixels 16..18 (tail of window)
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v25.2d, #8
+        mov             v26.b[7], v20.b[0]            // patch lane lost by the 64-bit shift
+        mov             v27.b[7], v20.b[1]
+        ushr            v28.2d, v26.2d, #8
+        mov             v28.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27       // even outputs
+        calc_epelb      v17, v25, v26, v27, v28       // odd outputs
+        ld2            {v24.8h, v25.8h}, [x4], x10   // bi row, same even/odd layout
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        sqrshrun        v5.8b, v17.8h, #7
+        st2            {v4.8b, v5.8b}, [x0], x1      // re-interleave on store
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+// epel bi, horizontal, width 24, 8-bit: ld3 splits the source into three
+// phase streams; three extra tail pixels are patched into the lanes lost by
+// the per-64-bit shifts. ld3/st3 keep the bi row and the output in matching
+// deinterleaved layout.
+function ff_hevc_put_hevc_epel_bi_h24_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1                   // filter window starts at x - 1
+        sub             x3, x3, #24                  // src advanced #24 + x3 per row
+        mov             x10, #(MAX_PB_SIZE * 2)      // src2 row stride
+1:      ld3            {v24.8b, v25.8b, v26.8b}, [x2], #24  // pixels 0..23, 3-way split
+        ld1            {v20.s}[0], [x2], x3          // pixels 24..26
+        ushr            v27.2d, v24.2d, #8
+        ushr            v28.2d, v25.2d, #8
+        ushr            v29.2d, v26.2d, #8
+        mov             v27.b[7], v20.b[0]            // patch shifted-out lanes
+        mov             v28.b[7], v20.b[1]
+        mov             v29.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        ld3            {v24.8h, v25.8h, v26.8h}, [x4], x10  // bi row, same layout
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        sqrshrun        v5.8b, v17.8h, #7
+        sqrshrun        v6.8b, v18.8h, #7
+        st3            {v4.8b, v5.8b, v6.8b}, [x0], x1      // re-interleave on store
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+// epel bi, horizontal, width 32, 8-bit: ld4 splits the source into four
+// phase streams; three tail pixels are patched into the shifted copies.
+// ld4/st4 keep the bi row and the output in matching deinterleaved layout.
+function ff_hevc_put_hevc_epel_bi_h32_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1                   // filter window starts at x - 1
+        sub             x3, x3, #32                  // src advanced #32 + x3 per row
+        mov             x10, #(MAX_PB_SIZE * 2)      // src2 row stride
+1:      ld4            {v24.8b, v25.8b, v26.8b, v27.8b}, [x2], #32  // pixels 0..31
+        ld1            {v20.s}[0], [x2], x3          // pixels 32..34
+        ushr            v28.2d, v24.2d, #8
+        ushr            v29.2d, v25.2d, #8
+        ushr            v30.2d, v26.2d, #8
+        ins             v28.b[7], v20.b[0]            // patch shifted-out lanes
+        ins             v29.b[7], v20.b[1]
+        ins             v30.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v19.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        calc_epelb      v19, v27, v28, v29, v30
+        ld4            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10  // bi row
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqadd           v19.8h, v19.8h, v27.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        sqrshrun        v5.8b, v17.8h, #7
+        sqrshrun        v6.8b, v18.8h, #7
+        sqrshrun        v7.8b, v19.8h, #7
+        st4            {v4.8b, v5.8b, v6.8b, v7.8b}, [x0], x1       // re-interleave
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+// epel bi, horizontal, width 48, 8-bit: ld3 on 16-byte registers fetches 48
+// pixels in a 3-way split (but x7 = 24 so x2 only advances halfway); the two
+// single-word loads then pick up pixels 24..26 and 48..50 to patch the lanes
+// that the per-64-bit ushr cannot carry across. The bi row is consumed in
+// two ld3 halves (48 + 80 bytes = MAX_PB_SIZE * 2 per row).
+function ff_hevc_put_hevc_epel_bi_h48_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1                   // filter window starts at x - 1
+        sub             x3, x3, #48                  // src advanced 24 + 24 + x3 per row
+        mov             x7, #24                      // half-row source step (my is unused here)
+        mov             x10, #(128 - 48)             // 2nd bi-row step: MAX_PB_SIZE*2 - 48
+1:      ld3            {v26.16b, v27.16b, v28.16b}, [x2], x7  // pixels 0..47, 3-way split
+        ushr            v29.2d, v26.2d, #8
+        ushr            v30.2d, v27.2d, #8
+        ushr            v31.2d, v28.2d, #8
+        ld1            {v24.s}[0], [x2], x7          // pixels 24..26
+        ld1            {v25.s}[0], [x2], x3          // pixels 48..50
+        mov             v29.b[7], v24.b[0]            // patch low-half shifted-out lanes
+        mov             v30.b[7], v24.b[1]
+        mov             v31.b[7], v24.b[2]
+        mov             v29.b[15], v25.b[0]           // patch high-half shifted-out lanes
+        mov             v30.b[15], v25.b[1]
+        mov             v31.b[15], v25.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        calc_epelb      v16, v26, v27, v28, v29       // low 8 lanes of each stream
+        calc_epelb2     v20, v26, v27, v28, v29       // high 8 lanes
+        calc_epelb      v17, v27, v28, v29, v30
+        calc_epelb2     v21, v27, v28, v29, v30
+        calc_epelb      v18, v28, v29, v30, v31
+        calc_epelb2     v22, v28, v29, v30, v31
+        ld3            {v24.8h, v25.8h, v26.8h}, [x4], #48   // bi row, first 24 values
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        ld3            {v27.8h, v28.8h, v29.8h}, [x4], x10   // bi row, last 24 values
+        sqadd           v20.8h, v20.8h, v27.8h
+        sqadd           v21.8h, v21.8h, v28.8h
+        sqadd           v22.8h, v22.8h, v29.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        sqrshrun        v5.8b, v17.8h, #7
+        sqrshrun        v6.8b, v18.8h, #7
+        sqrshrun2       v4.16b, v20.8h, #7
+        sqrshrun2       v5.16b, v21.8h, #7
+        sqrshrun2       v6.16b, v22.8h, #7
+        st3            {v4.16b, v5.16b, v6.16b}, [x0], x1    // re-interleave on store
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+// epel bi, horizontal, width 64, 8-bit: ld4 on 16-byte registers fetches 64
+// pixels in a 4-way split (x7 = 32 so x2 advances halfway); the single-word
+// loads pick up pixels 32..34 and 64..66 to patch the lanes the per-64-bit
+// ushr cannot carry across. The bi row is consumed as two ld4 loads of
+// 64 bytes each (= MAX_PB_SIZE * 2 per row, no extra stride step needed).
+function ff_hevc_put_hevc_epel_bi_h64_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1                   // filter window starts at x - 1
+        sub             x3, x3, #64                  // src advanced 32 + 32 + x3 per row
+        mov             x7, #32                      // half-row source step (my is unused here)
+1:      ld4            {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x7  // pixels 0..63
+        ushr            v28.2d, v24.2d, #8
+        ushr            v29.2d, v25.2d, #8
+        ushr            v30.2d, v26.2d, #8
+        ld1            {v4.s}[0], [x2], x7           // pixels 32..34
+        ld1            {v5.s}[0], [x2], x3           // pixels 64..66
+        ins             v28.b[7], v4.b[0]             // patch shifted-out lanes
+        ins             v28.b[15], v5.b[0]
+        ins             v29.b[7], v4.b[1]
+        ins             v29.b[15], v5.b[1]
+        ins             v30.b[7], v4.b[2]
+        ins             v30.b[15], v5.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v19.8h, #0
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27       // low 8 lanes of each stream
+        calc_epelb2     v20, v24, v25, v26, v27       // high 8 lanes
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb2     v21, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        calc_epelb2     v22, v26, v27, v28, v29
+        calc_epelb      v19, v27, v28, v29, v30
+        calc_epelb2     v23, v27, v28, v29, v30
+        ld4            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #64  // bi row, first 32
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqadd           v19.8h, v19.8h, v27.8h
+        ld4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64  // bi row, last 32
+        sqadd           v20.8h, v20.8h, v28.8h
+        sqadd           v21.8h, v21.8h, v29.8h
+        sqadd           v22.8h, v22.8h, v30.8h
+        sqadd           v23.8h, v23.8h, v31.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        sqrshrun        v5.8b, v17.8h, #7
+        sqrshrun        v6.8b, v18.8h, #7
+        sqrshrun        v7.8b, v19.8h, #7
+        sqrshrun2       v4.16b, v20.8h, #7
+        sqrshrun2       v5.16b, v21.8h, #7
+        sqrshrun2       v6.16b, v22.8h, #7
+        sqrshrun2       v7.16b, v23.8h, #7
+        st4            {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1   // re-interleave
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v4_8_neon, export=1
+        // Bi-pred 4-tap EPEL vertical filter, width 4, 8-bit depth.
+        // x0 dst, x1 dststride, x2 src, x3 srcstride, x5 height, x7 my,
+        // x4 second prediction source (int16, row stride MAX_PB_SIZE*2).
+        load_epel_filterb x7, x6        // taps selected by my; x6 is scratch
+        sub             x2, x2, x3      // start one row above for the first tap
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.s}[0], [x2], x3 // prime the 3-row history of the
+        ld1            {v17.s}[0], [x2], x3 // 4-row sliding window v16..v19
+        ld1            {v18.s}[0], [x2], x3
+1:      ld1            {v19.s}[0], [x2], x3 // newest row
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19 // 4-tap vertical MAC (s16)
+        ld1            {v24.4h}, [x4], x10    // second source row
+        sqadd           v4.8h, v4.8h, v24.8h   // bi-pred saturating add
+        sqrshrun        v4.8b, v4.8h, #7       // round/narrow to u8
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1   // height
+        b.eq            2f
+
+        // Loop is unrolled x4 so the window rotates through register names
+        // instead of data being moved; only the tap order changes below.
+        ld1            {v16.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        ld1            {v24.4h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        ld1            {v24.4h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        ld1            {v24.4h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v6_8_neon, export=1
+        // Bi-pred 4-tap EPEL vertical filter, width 6, 8-bit depth.
+        // Same scheme as the v4 variant; 8 lanes computed, 6 stored (4+2).
+        load_epel_filterb x7, x6        // taps selected by my; x6 is scratch
+        sub             x2, x2, x3      // start one row above for the first tap
+        sub             x1, x1, #4      // dst stride remainder after the 4-byte store
+        mov             x10, #(MAX_PB_SIZE * 2) // second-source row stride
+        ld1            {v16.8b}, [x2], x3 // prime 3-row history of window v16..v19
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+1:      ld1            {v19.8b}, [x2], x3 // newest row
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19 // 4-tap vertical MAC (s16)
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h   // bi-pred saturating add
+        sqrshrun        v4.8b, v4.8h, #7       // round/narrow to u8
+        st1            {v4.s}[0], [x0], #4    // store 6 bytes as 4 + 2
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        // Unrolled x4: window rotates through register names below.
+        ld1            {v16.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v8_8_neon, export=1
+        // Bi-pred 4-tap EPEL vertical filter, width 8, 8-bit depth.
+        // Same scheme as the v4 variant with full 8-byte rows.
+        load_epel_filterb x7, x6        // taps selected by my; x6 is scratch
+        sub             x2, x2, x3      // start one row above for the first tap
+        mov             x10, #(MAX_PB_SIZE * 2) // second-source row stride
+        ld1            {v16.8b}, [x2], x3 // prime 3-row history of window v16..v19
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+1:      ld1            {v19.8b}, [x2], x3 // newest row
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19 // 4-tap vertical MAC (s16)
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h   // bi-pred saturating add
+        sqrshrun        v4.8b, v4.8h, #7       // round/narrow to u8
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        // Unrolled x4: window rotates through register names below.
+        ld1            {v16.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v12_8_neon, export=1
+        // Bi-pred 4-tap EPEL vertical filter, width 12, 8-bit depth.
+        // 16 lanes computed (low/high halves), 12 stored per row (8 + 4).
+        load_epel_filterb x7, x6        // taps selected by my; x6 is scratch
+        sub             x1, x1, #8      // dst stride remainder after the 8-byte store
+        sub             x2, x2, x3      // start one row above for the first tap
+        mov             x10, #(MAX_PB_SIZE * 2) // second-source row stride
+        ld1            {v16.16b}, [x2], x3 // prime 3-row history of window v16..v19
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+1:      ld1            {v19.16b}, [x2], x3 // newest row
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19  // low 8 lanes
+        calc_epelb2     v5, v16, v17, v18, v19  // high lanes
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h    // bi-pred saturating add
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7        // round/narrow to u8
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8       // store 12 bytes as 8 + 4
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        // Unrolled x4: window rotates through register names below.
+        ld1            {v16.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v16_8_neon, export=1
+        // Bi-pred 4-tap EPEL vertical filter, width 16, 8-bit depth.
+        // Like the v12 variant but stores the full 16-byte row.
+        load_epel_filterb x7, x6        // taps selected by my; x6 is scratch
+        sub             x2, x2, x3      // start one row above for the first tap
+        mov             x10, #(MAX_PB_SIZE * 2) // second-source row stride
+        ld1            {v16.16b}, [x2], x3 // prime 3-row history of window v16..v19
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+1:      ld1            {v19.16b}, [x2], x3 // newest row
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19  // low 8 lanes
+        calc_epelb2     v5, v16, v17, v18, v19  // high 8 lanes
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h    // bi-pred saturating add
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7        // round/narrow to u8
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        // Unrolled x4: window rotates through register names below.
+        ld1            {v16.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v24_8_neon, export=1
+        // Bi-pred 4-tap EPEL vertical filter, width 24, 8-bit depth.
+        // Each row is three 8-byte groups; the 4-row window therefore spans
+        // twelve registers (v16..v27), rotated in groups of three.
+        load_epel_filterb x7, x6        // taps selected by my; x6 is scratch
+        sub             x2, x2, x3      // start one row above for the first tap
+        mov             x10, #(MAX_PB_SIZE * 2) // second-source row stride
+        ld1            {v16.8b, v17.8b, v18.8b}, [x2], x3 // prime 3-row history
+        ld1            {v19.8b, v20.8b, v21.8b}, [x2], x3
+        ld1            {v22.8b, v23.8b, v24.8b}, [x2], x3
+1:      ld1            {v25.8b, v26.8b, v27.8b}, [x2], x3 // newest row
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v16, v19, v22, v25  // 4-tap MAC per 8-lane group
+        calc_epelb      v5, v17, v20, v23, v26
+        calc_epelb      v6, v18, v21, v24, v27
+        ld1            {v28.8h, v29.8h, v30.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v28.8h    // bi-pred saturating add
+        sqadd           v5.8h, v5.8h, v29.8h
+        sqadd           v6.8h, v6.8h, v30.8h
+        sqrshrun        v4.8b, v4.8h, #7        // round/narrow to u8
+        sqrshrun        v5.8b, v5.8h, #7
+        sqrshrun        v6.8b, v6.8h, #7
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        // Unrolled x4: window rotates through register groups below.
+        ld1            {v16.8b, v17.8b, v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v19, v22, v25, v16
+        calc_epelb      v5, v20, v23, v26, v17
+        calc_epelb      v6, v21, v24, v27, v18
+        ld1            {v28.8h, v29.8h, v30.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v28.8h
+        sqadd           v5.8h, v5.8h, v29.8h
+        sqadd           v6.8h, v6.8h, v30.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun        v5.8b, v5.8h, #7
+        sqrshrun        v6.8b, v6.8h, #7
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8b, v20.8b, v21.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v22, v25, v16, v19
+        calc_epelb      v5, v23, v26, v17, v20
+        calc_epelb      v6, v24, v27, v18, v21
+        ld1            {v28.8h, v29.8h, v30.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v28.8h
+        sqadd           v5.8h, v5.8h, v29.8h
+        sqadd           v6.8h, v6.8h, v30.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun        v5.8b, v5.8h, #7
+        sqrshrun        v6.8b, v6.8h, #7
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8b, v23.8b, v24.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v25, v16, v19, v22
+        calc_epelb      v5, v26, v17, v20, v23
+        calc_epelb      v6, v27, v18, v21, v24
+        ld1            {v28.8h, v29.8h, v30.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v28.8h
+        sqadd           v5.8h, v5.8h, v29.8h
+        sqadd           v6.8h, v6.8h, v30.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun        v5.8b, v5.8h, #7
+        sqrshrun        v6.8b, v6.8h, #7
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v32_8_neon, export=1
+        // Bi-pred 4-tap EPEL vertical filter, width 32, 8-bit depth.
+        // Each row is two 16-byte halves; the 4-row window spans v16..v23,
+        // rotated in register pairs.
+        load_epel_filterb x7, x6        // taps selected by my; x6 is scratch
+        sub             x2, x2, x3      // start one row above for the first tap
+        mov             x10, #(MAX_PB_SIZE * 2) // second-source row stride
+        ld1            {v16.16b, v17.16b}, [x2], x3 // prime 3-row history
+        ld1            {v18.16b, v19.16b}, [x2], x3
+        ld1            {v20.16b, v21.16b}, [x2], x3
+1:      ld1            {v22.16b, v23.16b}, [x2], x3 // newest row
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v16, v18, v20, v22  // left half, low lanes
+        calc_epelb2     v5, v16, v18, v20, v22  // left half, high lanes
+        calc_epelb      v6, v17, v19, v21, v23  // right half, low lanes
+        calc_epelb2     v7, v17, v19, v21, v23  // right half, high lanes
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h    // bi-pred saturating add
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqadd           v6.8h, v6.8h, v26.8h
+        sqadd           v7.8h, v7.8h, v27.8h
+        sqrshrun        v4.8b, v4.8h, #7        // round/narrow to u8
+        sqrshrun2       v4.16b, v5.8h, #7
+        sqrshrun        v5.8b, v6.8h, #7
+        sqrshrun2       v5.16b, v7.8h, #7
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        // Unrolled x4: window rotates through register pairs below.
+        ld1            {v16.16b, v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v18, v20, v22, v16
+        calc_epelb2     v5, v18, v20, v22, v16
+        calc_epelb      v6, v19, v21, v23, v17
+        calc_epelb2     v7, v19, v21, v23, v17
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqadd           v6.8h, v6.8h, v26.8h
+        sqadd           v7.8h, v7.8h, v27.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        sqrshrun        v5.8b, v6.8h, #7
+        sqrshrun2       v5.16b, v7.8h, #7
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.16b, v19.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v20, v22, v16, v18
+        calc_epelb2     v5, v20, v22, v16, v18
+        calc_epelb      v6, v21, v23, v17, v19
+        calc_epelb2     v7, v21, v23, v17, v19
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqadd           v6.8h, v6.8h, v26.8h
+        sqadd           v7.8h, v7.8h, v27.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        sqrshrun        v5.8b, v6.8h, #7
+        sqrshrun2       v5.16b, v7.8h, #7
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v22, v16, v18, v20
+        calc_epelb2     v5, v22, v16, v18, v20
+        calc_epelb      v6, v23, v17, v19, v21
+        calc_epelb2     v7, v23, v17, v19, v21
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqadd           v6.8h, v6.8h, v26.8h
+        sqadd           v7.8h, v7.8h, v27.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        sqrshrun        v5.8b, v6.8h, #7
+        sqrshrun2       v5.16b, v7.8h, #7
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v48_8_neon, export=1
+        // Width 48 = two width-24 passes. Save the argument registers the
+        // callee may clobber, run the left 24 columns, then rerun the
+        // 24-wide kernel shifted right by 24 pixels (48 bytes in the
+        // int16 second source).
+        stp             x7, x30, [sp, #-16]!   // my + link register
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_bi_v24_8_neon)
+        ldp             x4, x5, [sp], #16      // restore args for second half
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #24            // dst  += 24 pixels
+        add             x2, x2, #24            // src  += 24 pixels
+        add             x4, x4, #48            // src2 += 24 int16 samples
+        ldr             x7, [sp]               // reload my (still on stack)
+        bl              X(ff_hevc_put_hevc_epel_bi_v24_8_neon)
+        ldp             xzr, x30, [sp], #16    // drop saved x7, restore LR
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v64_8_neon, export=1
+        // Width 64 = two width-32 passes; same save/offset/rerun pattern
+        // as the v48 wrapper above.
+        stp             x7, x30, [sp, #-16]!   // my + link register
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_bi_v32_8_neon)
+        ldp             x4, x5, [sp], #16      // restore args for second half
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #32            // dst  += 32 pixels
+        add             x2, x2, #32            // src  += 32 pixels
+        add             x4, x4, #64            // src2 += 32 int16 samples
+        ldr             x7, [sp]               // reload my (still on stack)
+        bl              X(ff_hevc_put_hevc_epel_bi_v32_8_neon)
+        ldp             xzr, x30, [sp], #16    // drop saved x7, restore LR
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv4_8_neon, export=1
+        // Bi-pred 4-tap EPEL H+V filter, width 4: horizontal pass into a
+        // stack tmp array of (height+3) rows * MAX_PB_SIZE*2 bytes, then a
+        // vertical 4-tap pass on the int16 intermediate.
+        add             x10, x5, #3
+        lsl             x10, x10, #7    // (height+3) * MAX_PB_SIZE * 2
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48     // tmp_array sits above the 3 saved pairs
+        sub             x1, x2, x3      // h pass starts one row above src
+        mov             x2, x3
+        add             x3, x5, #3      // height+3 rows for the vertical taps
+        mov             x4, x6          // mx
+        mov             x5, x7          // presumably width arg of the helper — see helper
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6        // vertical taps selected by my
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // Post-increment loads below walk sp through tmp_array; after
+        // 3+height rows the whole allocation is consumed, restoring sp.
+        ld1            {v16.4h}, [sp], x10 // prime 3-row history of window
+        ld1            {v17.4h}, [sp], x10
+        ld1            {v18.4h}, [sp], x10
+1:      ld1            {v19.4h}, [sp], x10 // newest intermediate row
+        calc_epelh      v4, v16, v17, v18, v19 // vertical 4-tap on s16
+        ld1            {v6.4h}, [x4], x10     // second source row
+        sqadd           v4.4h, v4.4h, v6.4h    // bi-pred saturating add
+        sqrshrun        v4.8b, v4.8h, #7       // round/narrow to u8
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        // Unrolled x4: window rotates through register names below.
+        ld1            {v16.4h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        ld1            {v6.4h}, [x4], x10
+        sqadd           v4.4h, v4.4h, v6.4h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        ld1            {v6.4h}, [x4], x10
+        sqadd           v4.4h, v4.4h, v6.4h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        ld1            {v6.4h}, [x4], x10
+        sqadd           v4.4h, v4.4h, v6.4h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv6_8_neon, export=1
+        // Bi-pred 4-tap EPEL H+V filter, width 6. Same two-pass scheme as
+        // the hv4 variant; rows stored as 4+2 bytes.
+        add             x10, x5, #3
+        lsl             x10, x10, #7    // (height+3) * MAX_PB_SIZE * 2
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48     // tmp_array above the 3 saved pairs
+        sub             x1, x2, x3      // h pass starts one row above src
+        mov             x2, x3
+        add             x3, x5, #3      // height+3 rows for the vertical taps
+        mov             x4, x6          // mx
+        mov             x5, x7
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6        // vertical taps selected by my
+        sub             x1, x1, #4      // dst stride remainder after 4-byte store
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // sp walks through tmp_array; fully consumed after 3+height rows.
+        ld1            {v16.8h}, [sp], x10 // prime 3-row history of window
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+1:      ld1            {v19.8h}, [sp], x10 // newest intermediate row
+        calc_epelh      v4, v16, v17, v18, v19      // low lanes
+        calc_epelh2     v4, v5, v16, v17, v18, v19  // high lanes (v5 scratch)
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h    // bi-pred saturating add
+        sqrshrun        v4.8b, v4.8h, #7       // round/narrow to u8
+        st1            {v4.s}[0], [x0], #4    // store 6 bytes as 4 + 2
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        // Unrolled x4: window rotates through register names below.
+        ld1            {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv8_8_neon, export=1
+        // Bi-pred 4-tap EPEL H+V filter, width 8. Same two-pass scheme as
+        // the hv4 variant with full 8-lane rows.
+        add             x10, x5, #3
+        lsl             x10, x10, #7    // (height+3) * MAX_PB_SIZE * 2
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48     // tmp_array above the 3 saved pairs
+        sub             x1, x2, x3      // h pass starts one row above src
+        mov             x2, x3
+        add             x3, x5, #3      // height+3 rows for the vertical taps
+        mov             x4, x6          // mx
+        mov             x5, x7
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6        // vertical taps selected by my
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // sp walks through tmp_array; fully consumed after 3+height rows.
+        ld1            {v16.8h}, [sp], x10 // prime 3-row history of window
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+1:      ld1            {v19.8h}, [sp], x10 // newest intermediate row
+        calc_epelh      v4, v16, v17, v18, v19      // low lanes
+        calc_epelh2     v4, v5, v16, v17, v18, v19  // high lanes (v5 scratch)
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h    // bi-pred saturating add
+        sqrshrun        v4.8b, v4.8h, #7       // round/narrow to u8
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        // Unrolled x4: window rotates through register names below.
+        ld1            {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv12_8_neon, export=1
+        // Bi-pred 4-tap EPEL H+V filter, width 12. Two-pass scheme as hv4;
+        // intermediate rows are register pairs, output stored as 8+4 bytes.
+        add             x10, x5, #3
+        lsl             x10, x10, #7    // (height+3) * MAX_PB_SIZE * 2
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48     // tmp_array above the 3 saved pairs
+        sub             x1, x2, x3      // h pass starts one row above src
+        mov             x2, x3
+        add             x3, x5, #3      // height+3 rows for the vertical taps
+        mov             x4, x6          // mx
+        mov             x5, x7
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6        // vertical taps selected by my
+        sub             x1, x1, #8      // dst stride remainder after 8-byte store
+        mov             x10, #(MAX_PB_SIZE * 2)
+        // sp walks through tmp_array; fully consumed after 3+height rows.
+        ld1            {v16.8h, v17.8h}, [sp], x10 // prime 3-row history
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h}, [sp], x10
+1:      ld1            {v22.8h, v23.8h}, [sp], x10 // newest intermediate row
+        calc_epelh      v4, v16, v18, v20, v22      // lanes 0-3
+        calc_epelh2     v4, v5, v16, v18, v20, v22  // lanes 4-7 (v5 scratch)
+        calc_epelh      v5, v17, v19, v21, v23      // lanes 8-11
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h    // bi-pred saturating add
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7       // round/narrow to u8
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8      // store 12 bytes as 8 + 4
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        // Unrolled x4: window rotates through register pairs below.
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv16_8_neon, export=1 // (dst, dststride, src, srcstride, src2, height, mx, my)
+        add             x10, x5, #3             // x10 = height + 3 rows (4-tap vertical filter needs 3 extra)
+        lsl             x10, x10, #7            // x10 *= MAX_PB_SIZE * 2 -> byte size of int16 tmp array
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!     // save dst/dststride across helper call
+        stp             x4, x5, [sp, #-16]!     // save src2/height
+        stp             x7, x30, [sp, #-16]!    // save my and link register
+        add             x0, sp, #48             // helper dst = tmp_array (above the 3 saved pairs)
+        sub             x1, x2, x3              // helper src = src - srcstride (start one row above)
+        mov             x2, x3                  // helper srcstride
+        add             x3, x5, #3              // helper height = height + 3
+        mov             x4, x6                  // helper mx
+        mov             x5, x7                  // helper my (unused by the h pass)
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon) // horizontal pass into tmp_array
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6                // vertical (my) filter taps -> v0 (macro defined earlier in this file)
+        mov             x10, #(MAX_PB_SIZE * 2) // row stride of tmp_array and src2, in bytes
+        ld1            {v16.8h, v17.8h}, [sp], x10 // prime the 4-row vertical window: row 0
+        ld1            {v18.8h, v19.8h}, [sp], x10 // row 1
+        ld1            {v20.8h, v21.8h}, [sp], x10 // row 2
+1:      ld1            {v22.8h, v23.8h}, [sp], x10 // row n+3
+        calc_epelh      v4, v16, v18, v20, v22  // vertical 4-tap, columns 0-3
+        calc_epelh2     v4, v5, v16, v18, v20, v22 // columns 4-7 (v5 is scratch)
+        calc_epelh      v5, v17, v19, v21, v23  // columns 8-11
+        calc_epelh2     v5, v6, v17, v19, v21, v23 // columns 12-15 (v6 scratch, reloaded below)
+        ld1            {v6.8h, v7.8h}, [x4], x10 // src2 row (bi prediction source)
+        sqadd           v4.8h, v4.8h, v6.8h     // add bi source, saturating
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7        // round-shift by 7 and clamp to u8
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1       // store 16 output pixels
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x10 // unrolled x4: same body with row registers rotated
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10 // rotation step 3
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10 // rotation step 4; window is back in phase for 1b
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv24_8_neon, export=1 // (dst, dststride, src, srcstride, src2, height, mx, my)
+        add             x10, x5, #3             // height + 3 rows for the 4-tap vertical filter
+        lsl             x10, x10, #7            // * MAX_PB_SIZE * 2 -> int16 tmp array bytes
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!     // save dst/dststride
+        stp             x4, x5, [sp, #-16]!     // save src2/height
+        stp             x7, x30, [sp, #-16]!    // save my/lr
+        add             x0, sp, #48             // helper dst = tmp_array
+        sub             x1, x2, x3              // helper src = src - srcstride
+        mov             x2, x3
+        add             x3, x5, #3              // helper height = height + 3
+        mov             x4, x6                  // mx
+        mov             x5, x7                  // my
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon) // horizontal pass into tmp_array
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6                // vertical (my) filter taps -> v0
+        mov             x10, #(MAX_PB_SIZE * 2) // tmp/src2 row stride in bytes
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10 // prime window: row 0 (24 cols = 3 regs)
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10 // row 1
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10 // row 2
+1:      ld1            {v25.8h, v26.8h, v27.8h}, [sp], x10 // row n+3
+        calc_epelh      v1, v16, v19, v22, v25  // vertical 4-tap, cols 0-3
+        calc_epelh2     v1, v2, v16, v19, v22, v25 // cols 4-7 (v2 scratch)
+        calc_epelh      v2, v17, v20, v23, v26  // cols 8-11
+        calc_epelh2     v2, v3, v17, v20, v23, v26 // cols 12-15
+        calc_epelh      v3, v18, v21, v24, v27  // cols 16-19
+        calc_epelh2     v3, v4, v18, v21, v24, v27 // cols 20-23
+        ld1            {v4.8h, v5.8h, v6.8h}, [x4], x10 // src2 row (bi source)
+        sqadd           v1.8h, v1.8h, v4.8h
+        sqadd           v2.8h, v2.8h, v5.8h
+        sqadd           v3.8h, v3.8h, v6.8h
+        sqrshrun        v1.8b, v1.8h, #7        // round, shift 7, clamp to u8
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b}, [x0], x1 // store 24 pixels
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10 // unrolled x4: row registers rotated
+        calc_epelh      v1, v19, v22, v25, v16
+        calc_epelh2     v1, v2, v19, v22, v25, v16
+        calc_epelh      v2, v20, v23, v26, v17
+        calc_epelh2     v2, v3, v20, v23, v26, v17
+        calc_epelh      v3, v21, v24, v27, v18
+        calc_epelh2     v3, v4, v21, v24, v27, v18
+        ld1            {v4.8h, v5.8h, v6.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v4.8h
+        sqadd           v2.8h, v2.8h, v5.8h
+        sqadd           v3.8h, v3.8h, v6.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10 // rotation step 3
+        calc_epelh      v1, v22, v25, v16, v19
+        calc_epelh2     v1, v2, v22, v25, v16, v19
+        calc_epelh      v2, v23, v26, v17, v20
+        calc_epelh2     v2, v3, v23, v26, v17, v20
+        calc_epelh      v3, v24, v27, v18, v21
+        calc_epelh2     v3, v4, v24, v27, v18, v21
+        ld1            {v4.8h, v5.8h, v6.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v4.8h
+        sqadd           v2.8h, v2.8h, v5.8h
+        sqadd           v3.8h, v3.8h, v6.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10 // rotation step 4; back in phase for 1b
+        calc_epelh      v1, v25, v16, v19, v22
+        calc_epelh2     v1, v2, v25, v16, v19, v22
+        calc_epelh      v2, v26, v17, v20, v23
+        calc_epelh2     v2, v3, v26, v17, v20, v23
+        calc_epelh      v3, v27, v18, v21, v24
+        calc_epelh2     v3, v4, v27, v18, v21, v24
+        ld1            {v4.8h, v5.8h, v6.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v4.8h
+        sqadd           v2.8h, v2.8h, v5.8h
+        sqadd           v3.8h, v3.8h, v6.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv32_8_neon, export=1 // (dst, dststride, src, srcstride, src2, height, mx, my)
+        sub             sp, sp, #16             // v8 is callee-saved (AAPCS64: low 64 bits of v8-v15);
+        st1            {v8.16b}, [sp]           // whole q-register saved here, restored at label 2
+        add             x10, x5, #3             // height + 3 rows for the 4-tap vertical filter
+        lsl             x10, x10, #7            // * MAX_PB_SIZE * 2 -> int16 tmp array bytes
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!     // save dst/dststride
+        stp             x4, x5, [sp, #-16]!     // save src2/height
+        stp             x7, x30, [sp, #-16]!    // save my/lr
+        add             x0, sp, #48             // helper dst = tmp_array
+        sub             x1, x2, x3              // helper src = src - srcstride
+        mov             x2, x3
+        add             x3, x5, #3              // helper height = height + 3
+        mov             x4, x6                  // mx
+        mov             x5, x7                  // my
+        bl              X(ff_hevc_put_hevc_epel_h32_8_neon) // horizontal pass into tmp_array
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6                // vertical (my) filter taps -> v0
+        mov             x10, #(MAX_PB_SIZE * 2) // tmp/src2 row stride in bytes
+        ld1            {v16.8h, v17.8h, v18.8h, v19.8h}, [sp], x10 // prime window: row 0 (32 cols)
+        ld1            {v20.8h, v21.8h, v22.8h, v23.8h}, [sp], x10 // row 1
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [sp], x10 // row 2
+1:      ld1            {v28.8h, v29.8h, v30.8h, v31.8h}, [sp], x10 // row n+3
+        calc_epelh      v1, v16, v20, v24, v28  // vertical 4-tap, cols 0-3
+        calc_epelh2     v1, v2, v16, v20, v24, v28 // cols 4-7 (v2 scratch)
+        calc_epelh      v2, v17, v21, v25, v29  // cols 8-11
+        calc_epelh2     v2, v3, v17, v21, v25, v29
+        calc_epelh      v3, v18, v22, v26, v30  // cols 16-19
+        calc_epelh2     v3, v4, v18, v22, v26, v30
+        calc_epelh      v4, v19, v23, v27, v31  // cols 24-27
+        calc_epelh2     v4, v5, v19, v23, v27, v31
+        ld1            {v5.8h, v6.8h, v7.8h, v8.8h}, [x4], x10 // src2 row (bi source)
+        sqadd           v1.8h, v1.8h, v5.8h
+        sqadd           v2.8h, v2.8h, v6.8h
+        sqadd           v3.8h, v3.8h, v7.8h
+        sqadd           v4.8h, v4.8h, v8.8h
+        sqrshrun        v1.8b, v1.8h, #7        // round, shift 7, clamp to u8
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b, v4.8b}, [x0], x1 // store 32 pixels
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h, v18.8h, v19.8h}, [sp], x10 // unrolled x4: row regs rotated
+        calc_epelh      v1, v20, v24, v28, v16
+        calc_epelh2     v1, v2, v20, v24, v28, v16
+        calc_epelh      v2, v21, v25, v29, v17
+        calc_epelh2     v2, v3, v21, v25, v29, v17
+        calc_epelh      v3, v22, v26, v30, v18
+        calc_epelh2     v3, v4, v22, v26, v30, v18
+        calc_epelh      v4, v23, v27, v31, v19
+        calc_epelh2     v4, v5, v23, v27, v31, v19
+        ld1            {v5.8h, v6.8h, v7.8h, v8.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v5.8h
+        sqadd           v2.8h, v2.8h, v6.8h
+        sqadd           v3.8h, v3.8h, v7.8h
+        sqadd           v4.8h, v4.8h, v8.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b, v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h, v22.8h, v23.8h}, [sp], x10 // rotation step 3
+        calc_epelh      v1, v24, v28, v16, v20
+        calc_epelh2     v1, v2, v24, v28, v16, v20
+        calc_epelh      v2, v25, v29, v17, v21
+        calc_epelh2     v2, v3, v25, v29, v17, v21
+        calc_epelh      v3, v26, v30, v18, v22
+        calc_epelh2     v3, v4, v26, v30, v18, v22
+        calc_epelh      v4, v27, v31, v19, v23
+        calc_epelh2     v4, v5, v27, v31, v19, v23
+        ld1            {v5.8h, v6.8h, v7.8h, v8.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v5.8h
+        sqadd           v2.8h, v2.8h, v6.8h
+        sqadd           v3.8h, v3.8h, v7.8h
+        sqadd           v4.8h, v4.8h, v8.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b, v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [sp], x10 // rotation step 4; back in phase for 1b
+        calc_epelh      v1, v28, v16, v20, v24
+        calc_epelh2     v1, v2, v28, v16, v20, v24
+        calc_epelh      v2, v29, v17, v21, v25
+        calc_epelh2     v2, v3, v29, v17, v21, v25
+        calc_epelh      v3, v30, v18, v22, v26
+        calc_epelh2     v3, v4, v30, v18, v22, v26
+        calc_epelh      v4, v31, v19, v23, v27
+        calc_epelh2     v4, v5, v31, v19, v23, v27
+        ld1            {v5.8h, v6.8h, v7.8h, v8.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v5.8h
+        sqadd           v2.8h, v2.8h, v6.8h
+        sqadd           v3.8h, v3.8h, v7.8h
+        sqadd           v4.8h, v4.8h, v8.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b, v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ld1            {v8.16b}, [sp], #16      // restore callee-saved v8
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv48_8_neon, export=1 // 48-wide = two 24-wide halves
+        stp             xzr, x30, [sp, #-16]!   // save lr; xzr slot keeps sp 16-byte aligned
+        stp             x0, x1, [sp, #-16]!     // save all 8 args across the first call
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x6, x7, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_bi_hv24_8_neon) // left 24 columns
+        ldp             x6, x7, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #24             // dst += 24 pixels
+        add             x2, x2, #24             // src += 24 pixels
+        add             x4, x4, #48             // src2 += 24 int16 elements (48 bytes)
+        bl              X(ff_hevc_put_hevc_epel_bi_hv24_8_neon) // right 24 columns (tail call would lose lr)
+        ldp             xzr, x30, [sp], #16     // restore lr; first value discarded into xzr
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv64_8_neon, export=1 // 64-wide = two 32-wide halves
+        stp             xzr, x30, [sp, #-16]!   // save lr; xzr slot keeps sp 16-byte aligned
+        stp             x0, x1, [sp, #-16]!     // save all 8 args across the first call
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x6, x7, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_bi_hv32_8_neon) // left 32 columns
+        ldp             x6, x7, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #32             // dst += 32 pixels
+        add             x2, x2, #32             // src += 32 pixels
+        add             x4, x4, #64             // src2 += 32 int16 elements (64 bytes)
+        bl              X(ff_hevc_put_hevc_epel_bi_hv32_8_neon) // right 32 columns
+        ldp             xzr, x30, [sp], #16     // restore lr; first value discarded into xzr
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index c785e46f79..0e107deea6 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,103 @@  void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
                                   int16_t *sao_offset_val, int sao_left_class,
                                   int width, int height);
 
+#define NEON8_FNPROTO(fn, args) /* declare the 8-bit NEON fn for each of the 9 HEVC block widths */ \
+    void ff_hevc_put_hevc_##fn##4_8_neon args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon args; \
 
+NEON8_FNPROTO(pel_pixels, (int16_t *dst, /* plain copy, no filtering; dst is MAX_PB_SIZE-strided int16 */
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(pel_bi_pixels, (uint8_t *dst, ptrdiff_t dststride, /* bi variants blend with src2 */
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_h, (int16_t *dst, /* epel = 4-tap chroma filters; h/v/hv select filter axes */
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_hv, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_uni_v, (uint8_t *dst,  ptrdiff_t dststride, /* uni variants write u8 pixels directly */
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_uni_hv, (uint8_t *dst, ptrdiff_t _dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_h, (int16_t *dst, /* qpel = 8-tap luma filters */
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_h, (uint8_t *dst,  ptrdiff_t dststride,
+            uint8_t *src, ptrdiff_t srcstride,
+            int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_hv, (uint8_t *dst,  ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+#define NEON8_FNASSIGN(member, v, h, fn) /* fn-table indices 1..9 map to widths 4,6,8,12,16,24,32,48,64 */ \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon;
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -76,6 +172,28 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
         c->sao_band_filter[0]          = ff_hevc_sao_band_filter_8x8_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels);
+        NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 0, pel_bi_pixels);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 1, epel_bi_h);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 0, epel_bi_v);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 1, epel_bi_hv);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 0, pel_bi_pixels);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 1, qpel_bi_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv);
     }
     if (bit_depth == 10) {
         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..cc2e9c51f9
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,5646 @@ 
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+.Lqpel_filters:                                 // HEVC 8-tap luma interpolation filters, 8 bytes per row
+        .byte  0,  0,  0,  0,  0,  0, 0,  0     // phase 0: integer position (no filtering)
+        .byte -1,  4,-10, 58, 17, -5, 1,  0     // phase 1: 1/4-pel
+        .byte -1,  4,-11, 40, 40,-11, 4, -1     // phase 2: 1/2-pel (symmetric)
+        .byte  0,  1, -5, 17, 58,-10, 4, -1     // phase 3: 3/4-pel (mirror of phase 1)
+
+.macro load_qpel_filterb freg, xreg             // splat filter row \freg into byte vectors v0-v7; \xreg is scratch
+        adr             \xreg, .Lqpel_filters
+        add             \xreg, \xreg, \freg, lsl #3 // 8 bytes per filter row
+        ld4r           {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4 // replicate taps 0-3
+        ld4r           {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]     // replicate taps 4-7
+        neg             v0.16b, v0.16b          // taps 0,2,5,7 are <= 0 in the table: negate them so
+        neg             v2.16b, v2.16b          // calc_qpelb can multiply-subtract (umlsl) their
+        neg             v5.16b, v5.16b          // positive magnitudes with unsigned arithmetic
+        neg             v7.16b, v7.16b
+.endm
+
+.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7 // 8-tap MAC on low 8 bytes -> \dst.8h
+        umlsl           \dst\().8h, \src0\().8b, v0.8b // umlsl for the taps negated by load_qpel_filterb
+        umlal           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7 // same 8-tap MAC on the high 8 bytes
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlal2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
+
+.macro load_qpel_filterh freg, xreg             // load filter row \freg as 8 signed 16-bit taps in v0.8h
+        adr             \xreg, .Lqpel_filters
+        add             \xreg, \xreg, \freg, lsl #3 // 8 bytes per filter row
+        ld1            {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b            // sign-extend: table entries can be negative
+.endm
+
+.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6 // 8-tap MAC on int16 rows, low 4 lanes
+        smull           \dst\().4s, \src0\().4h, v0.h[0] // widen to 32 bit to avoid overflow
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        smlal           \dst\().4s, \src4\().4h, v0.h[4]
+        smlal           \dst\().4s, \src5\().4h, v0.h[5]
+        smlal           \dst\().4s, \src6\().4h, v0.h[6]
+        smlal           \dst\().4s, \src7\().4h, v0.h[7]
+.ifeqs "\op", "sshr"
+        sshr            \dst\().4s, \dst\().4s, \shift  // sshr keeps the 32-bit width
+.else
+        \op             \dst\().4h, \dst\().4s, \shift  // otherwise \op is a narrowing shift (e.g. sqshrn)
+.endif
+.endm
+
+.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6 // high 4 lanes; \dstt is the 32-bit scratch
+        smull2          \dstt\().4s, \src0\().8h, v0.h[0]
+        smlal2          \dstt\().4s, \src1\().8h, v0.h[1]
+        smlal2          \dstt\().4s, \src2\().8h, v0.h[2]
+        smlal2          \dstt\().4s, \src3\().8h, v0.h[3]
+        smlal2          \dstt\().4s, \src4\().8h, v0.h[4]
+        smlal2          \dstt\().4s, \src5\().8h, v0.h[5]
+        smlal2          \dstt\().4s, \src6\().8h, v0.h[6]
+        smlal2          \dstt\().4s, \src7\().8h, v0.h[7]
+.ifeqs "\op", "sshr"
+        sshr            \dst\().4s, \dstt\().4s, \shift // sshr keeps the 32-bit width
+.else
+        \op             \dst\().8h, \dstt\().4s, \shift // narrowing shift into the high half of \dst
+.endif
+.endm
+
+function ff_hevc_put_hevc_qpel_h4_8_neon, export=1 // (int16_t *dst, uint8_t *src, ptrdiff_t srcstride, int height, intptr_t mx, ...)
+        load_qpel_filterb x4, x5                // mx selects the 8-tap filter row
+        sub             x1, x1, #3              // back up 3 pixels for the 8-tap window
+        sub             x2, x2, #8              // stride minus the 8 bytes loaded per row below
+        mov             x14, #(MAX_PB_SIZE * 2) // dst row stride in bytes
+1:      ld1            {v16.8b}, [x1], #8       // pixels 0-7
+        ld1            {v17.s}[0], [x1], x2     // pixels 8-11; x1 ends one full row later
+        ushr            v18.2d, v16.2d, #8      // v18..v24: source shifted by 1..7 pixels,
+        mov             v18.b[7], v17.b[0]      // refilling the vacated top lane where needed
+        ushr            v19.2d, v18.2d, #8
+        mov             v19.b[7], v17.b[1]
+        ushr            v20.2d, v19.2d, #8
+        mov             v20.b[7], v17.b[2]
+        ushr            v21.2d, v20.2d, #8      // only the low 4 lanes matter from here on
+        ushr            v22.2d, v21.2d, #8
+        ushr            v23.2d, v22.2d, #8
+        ushr            v24.2d, v23.2d, #8
+        movi            v28.8h, #0              // clear accumulator
+        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24 // 8-tap horizontal filter
+        st1            {v28.4h}, [x0], x14      // store 4 int16 intermediates
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h6_8_neon, export=1 // 6-wide horizontal 8-tap into int16 dst
+        load_qpel_filterb x4, x5                // mx selects the filter row
+        sub             x1, x1, #3              // back up 3 pixels for the 8-tap window
+        mov             x14, #(MAX_PB_SIZE * 2 - 8) // dst stride minus the 8 bytes stored first
+1:      ld1            {v16.8b, v17.8b}, [x1], x2 // 16 source bytes, advance one row
+        ushr            v18.2d, v16.2d, #8      // v18..v24: source shifted by 1..7 pixels
+        mov             v18.b[7], v17.b[0]      // top lane refilled from the second 8 bytes
+        ushr            v19.2d, v18.2d, #8
+        mov             v19.b[7], v17.b[1]
+        ushr            v20.2d, v19.2d, #8
+        mov             v20.b[7], v17.b[2]
+        ushr            v21.2d, v20.2d, #8
+        mov             v21.b[7], v17.b[3]
+        ushr            v22.2d, v21.2d, #8
+        mov             v22.b[7], v17.b[4]
+        ushr            v23.2d, v22.2d, #8      // lanes 6-7 unused: no refill needed
+        ushr            v24.2d, v23.2d, #8
+        movi            v28.8h, #0              // clear accumulator
+        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24 // 8-tap filter
+        st1            {v28.4h}, [x0], #8       // store results 0-3...
+        st1            {v28.s}[2], [x0], x14    // ...then 4-5 (6 outputs total)
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h8_8_neon, export=1 // 8-wide horizontal 8-tap into int16 dst
+        load_qpel_filterb x4, x5                // mx selects the filter row
+        sub             x1, x1, #3              // back up 3 pixels for the 8-tap window
+        mov             x14, #(MAX_PB_SIZE * 2) // dst row stride in bytes
+1:      ld1            {v16.8b, v17.8b}, [x1], x2 // 16 source bytes, advance one row
+        ushr            v18.2d, v16.2d, #8      // v18..v24: source shifted by 1..7 pixels,
+        mov             v18.b[7], v17.b[0]      // top lane refilled from the second 8 bytes
+        ushr            v19.2d, v18.2d, #8
+        mov             v19.b[7], v17.b[1]
+        ushr            v20.2d, v19.2d, #8
+        mov             v20.b[7], v17.b[2]
+        ushr            v21.2d, v20.2d, #8
+        mov             v21.b[7], v17.b[3]
+        ushr            v22.2d, v21.2d, #8
+        mov             v22.b[7], v17.b[4]
+        ushr            v23.2d, v22.2d, #8
+        mov             v23.b[7], v17.b[5]
+        ushr            v24.2d, v23.2d, #8
+        mov             v24.b[7], v17.b[6]
+        movi            v28.8h, #0              // clear accumulator
+        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24 // 8-tap filter, 8 outputs
+        st1            {v28.8h}, [x0], x14
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h12_8_neon, export=1 // 12-wide horizontal 8-tap into int16 dst
+        load_qpel_filterb x4, x5                // mx selects the filter row
+        sub             x1, x1, #3              // back up 3 pixels for the 8-tap window
+        sub             x2, x2, #16             // stride minus the 16 bytes loaded first
+        mov             x14, #(MAX_PB_SIZE * 2 - 16) // dst stride minus first store
+1:      ld2            {v16.8b, v17.8b}, [x1], #16 // de-interleave: v16 = even pixels, v17 = odd
+        ld1            {v27.s}[0], [x1], x2     // pixels 16-19; x1 ends one full row later
+        ushr            v18.2d, v16.2d, #8      // shifted even/odd streams (step = 2 source pixels)
+        ushr            v19.2d, v17.2d, #8
+        mov             v18.b[7], v27.b[0]
+        mov             v19.b[7], v27.b[1]
+        ushr            v20.2d, v18.2d, #8
+        ushr            v21.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[2]
+        mov             v21.b[7], v27.b[3]
+        ushr            v22.2d, v20.2d, #8
+        ushr            v23.2d, v21.2d, #8
+        ushr            v24.2d, v22.2d, #8      // high lanes unused for width 12
+        movi            v28.8h, #0              // accumulators for even / odd outputs
+        movi            v29.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23 // even output columns
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24 // odd output columns
+        zip1            v16.8h, v28.8h, v29.8h  // re-interleave even/odd into natural order
+        zip2            v17.8h, v28.8h, v29.8h
+        st1            {v16.8h}, [x0], #16      // store outputs 0-7...
+        st1            {v17.4h}, [x0], x14      // ...then 8-11
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h16_8_neon, export=1 // 16-wide horizontal 8-tap into int16 dst
+        load_qpel_filterb x4, x5                // mx selects the filter row
+        sub             x1, x1, #3              // back up 3 pixels for the 8-tap window
+        sub             x2, x2, #16             // stride minus the 16 bytes loaded first
+        mov             x14, #(MAX_PB_SIZE * 2) // dst row stride in bytes
+1:      ld2            {v16.8b, v17.8b}, [x1], #16 // de-interleave: v16 = even pixels, v17 = odd
+        ld1            {v27.8b}, [x1], x2       // pixels 16-23; x1 ends one full row later
+        ushr            v18.2d, v16.2d, #8      // shifted even/odd streams, refilled from v27
+        ushr            v19.2d, v17.2d, #8
+        mov             v18.b[7], v27.b[0]
+        mov             v19.b[7], v27.b[1]
+        ushr            v20.2d, v18.2d, #8
+        ushr            v21.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[2]
+        mov             v21.b[7], v27.b[3]
+        ushr            v22.2d, v20.2d, #8
+        ushr            v23.2d, v21.2d, #8
+        mov             v22.b[7], v27.b[4]
+        mov             v23.b[7], v27.b[5]
+        ushr            v24.2d, v22.2d, #8
+        mov             v24.b[7], v27.b[6]
+        movi            v28.8h, #0              // accumulators for even / odd outputs
+        movi            v29.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23 // even output columns
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24 // odd output columns
+        st2            {v28.8h, v29.8h}, [x0], x14 // st2 re-interleaves on store
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h24_8_neon, export=1 // 24-wide horizontal 8-tap into int16 dst
+        load_qpel_filterb x4, x5                // mx selects the filter row
+        sub             x1, x1, #3              // back up 3 pixels for the 8-tap window
+        sub             x2, x2, #24             // stride minus the 24 bytes loaded first
+        mov             x14, #(MAX_PB_SIZE * 2) // dst row stride in bytes
+1:      ld3            {v16.8b, v17.8b, v18.8b}, [x1], #24 // de-interleave into 3 phase streams
+        ld1            {v27.8b}, [x1], x2       // pixels 24-31; x1 ends one full row later
+        ushr            v19.2d, v16.2d, #8      // shifted streams (step = 3 source pixels),
+        ushr            v20.2d, v17.2d, #8      // top lanes refilled from v27
+        ushr            v21.2d, v18.2d, #8
+        mov             v19.b[7], v27.b[0]
+        mov             v20.b[7], v27.b[1]
+        mov             v21.b[7], v27.b[2]
+        ushr            v22.2d, v19.2d, #8
+        ushr            v23.2d, v20.2d, #8
+        ushr            v24.2d, v21.2d, #8
+        mov             v22.b[7], v27.b[3]
+        mov             v23.b[7], v27.b[4]
+        mov             v24.b[7], v27.b[5]
+        ushr            v25.2d, v22.2d, #8
+        mov             v25.b[7], v27.b[6]
+        movi            v28.8h, #0              // one accumulator per phase stream
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23 // columns 0,3,6,...
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24 // columns 1,4,7,...
+        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25 // columns 2,5,8,...
+        st3            {v28.8h, v29.8h, v30.8h}, [x0], x14 // st3 re-interleaves on store
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Horizontal 8-tap qpel filter, 8-bit source, 32 pixels per row.
+// Same register conventions as the h24 variant: x0 = dst (stride
+// MAX_PB_SIZE*2), x1 = src, x2 = srcstride, x3 = height.
+function ff_hevc_put_hevc_qpel_h32_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3      // back up 3 columns for the 8-tap window
+        sub             x2, x2, #32     // ld4 below already advances x1 by 32
+        mov             x14, #(MAX_PB_SIZE * 2)
+        // ld4 de-interleaves 32 pixels into 4 vectors of every-4th byte;
+        // ushr/mov build the remaining shifted phases, refilling the top
+        // byte of each from v27 (the 8 bytes following the 32 loaded ones).
+1:      ld4            {v16.8b, v17.8b, v18.8b, v19.8b}, [x1], #32
+        ld1            {v27.8b}, [x1], x2
+        ushr            v20.2d, v16.2d, #8
+        ushr            v21.2d, v17.2d, #8
+        ushr            v22.2d, v18.2d, #8
+        ushr            v23.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[0]
+        mov             v21.b[7], v27.b[1]
+        mov             v22.b[7], v27.b[2]
+        mov             v23.b[7], v27.b[3]
+        ushr            v24.2d, v20.2d, #8
+        ushr            v25.2d, v21.2d, #8
+        ushr            v26.2d, v22.2d, #8
+        mov             v24.b[7], v27.b[4]
+        mov             v25.b[7], v27.b[5]
+        mov             v26.b[7], v27.b[6]
+        movi            v28.8h, #0      // zero the four accumulators
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        movi            v31.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
+        calc_qpelb      v31, v19, v20, v21, v22, v23, v24, v25, v26
+        st4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14  // re-interleave 32 results
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Horizontal 8-tap qpel filter, 8-bit source, 48 pixels per row.
+// Like h24 but with full 128-bit vectors (two 24-pixel halves per ld3):
+// the low 64-bit lane of each vector covers source bytes 0..23, the high
+// lane bytes 24..47, so each shifted phase needs two tail inserts
+// (b[7] from v26 = bytes 24.., b[15] from v27 = bytes 48..).
+function ff_hevc_put_hevc_qpel_h48_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3      // back up 3 columns for the 8-tap window
+        sub             x2, x2, #48     // x1 advances 24+24 via x7 before this
+        mov             x7, #24
+        mov             x14, #80        // dst step = MAX_PB_SIZE*2 - 48 (after the st3 #48 below)
+1:      ld3            {v16.16b, v17.16b, v18.16b}, [x1], x7
+        ld1            {v26.8b}, [x1], x7
+        ld1            {v27.8b}, [x1], x2
+        ushr            v19.2d, v16.2d, #8
+        ushr            v20.2d, v17.2d, #8
+        ushr            v21.2d, v18.2d, #8
+        mov             v19.b[7], v26.b[0]
+        mov             v19.b[15], v27.b[0]
+        mov             v20.b[7], v26.b[1]
+        mov             v20.b[15], v27.b[1]
+        mov             v21.b[7], v26.b[2]
+        mov             v21.b[15], v27.b[2]
+        ushr            v22.2d, v19.2d, #8
+        ushr            v23.2d, v20.2d, #8
+        ushr            v24.2d, v21.2d, #8
+        mov             v22.b[7], v26.b[3]
+        mov             v22.b[15], v27.b[3]
+        mov             v23.b[7], v26.b[4]
+        mov             v23.b[15], v27.b[4]
+        mov             v24.b[7], v26.b[5]
+        mov             v24.b[15], v27.b[5]
+        ushr            v25.2d, v22.2d, #8
+        mov             v25.b[7], v26.b[6]
+        mov             v25.b[15], v27.b[6]
+        movi            v28.8h, #0      // accumulators for the low halves
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
+        st3            {v28.8h, v29.8h, v30.8h}, [x0], #48   // first 24 results
+        movi            v28.8h, #0      // accumulators for the high halves
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        calc_qpelb2     v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb2     v30, v18, v19, v20, v21, v22, v23, v24, v25
+        st3            {v28.8h, v29.8h, v30.8h}, [x0], x14   // last 24 results + stride
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Horizontal 8-tap qpel filter, 8-bit source, 64 pixels per row.
+// Like h32 but with full 128-bit vectors (two 32-pixel halves per ld4);
+// tail pixels come from v27 (bytes 32..) for the low lanes and v28
+// (bytes 64..) for the high lanes.  Note the two st4 #64 stores advance
+// x0 by 128 bytes total, exactly MAX_PB_SIZE*2, so no extra stride add
+// is needed.
+function ff_hevc_put_hevc_qpel_h64_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3      // back up 3 columns for the 8-tap window
+        sub             x2, x2, #64     // x1 advances 32+32 via x7 before this
+        mov             x7, #32
+1:      ld4            {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x7
+        ld1            {v27.8b}, [x1], x7
+        ld1            {v28.8b}, [x1], x2
+        ushr            v20.2d, v16.2d, #8
+        ushr            v21.2d, v17.2d, #8
+        ushr            v22.2d, v18.2d, #8
+        ushr            v23.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[0]
+        mov             v21.b[7], v27.b[1]
+        mov             v22.b[7], v27.b[2]
+        mov             v23.b[7], v27.b[3]
+        mov             v20.b[15], v28.b[0]
+        mov             v21.b[15], v28.b[1]
+        mov             v22.b[15], v28.b[2]
+        mov             v23.b[15], v28.b[3]
+        ushr            v24.2d, v20.2d, #8
+        ushr            v25.2d, v21.2d, #8
+        ushr            v26.2d, v22.2d, #8
+        mov             v24.b[7], v27.b[4]
+        mov             v25.b[7], v27.b[5]
+        mov             v26.b[7], v27.b[6]
+        mov             v24.b[15], v28.b[4]
+        mov             v25.b[15], v28.b[5]
+        mov             v26.b[15], v28.b[6]
+        movi            v28.8h, #0      // v28 (tail bytes) no longer needed; reuse as accumulator
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        movi            v31.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
+        calc_qpelb      v31, v19, v20, v21, v22, v23, v24, v25, v26
+        st4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64  // first 32 results
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        movi            v31.8h, #0
+        calc_qpelb2     v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb2     v30, v18, v19, v20, v21, v22, v23, v24, v25
+        calc_qpelb2     v31, v19, v20, v21, v22, v23, v24, v25, v26
+        st4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64  // last 32 results
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Vertical 8-tap qpel filter, 8-bit source, 4 pixels per row.
+// x0 = dst (stride MAX_PB_SIZE*2), x1 = src, x2 = srcstride, x3 = height.
+// load_qpel_filterb takes x5 first here (vertical my index, presumably --
+// NOTE(review): confirm against the macro), x4 as scratch.
+// Keeps a sliding window of 8 source rows in v16..v23; the loop body is
+// unrolled 8x so each step reloads only the oldest row and rotates the
+// calc_qpelb argument order instead of moving registers.
+function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2      // x1 -= 3*srcstride: 3 rows above for the 8-tap window
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.s}[0], [x1], x2   // prime 7 rows of 4 pixels
+        ld1            {v17.s}[0], [x1], x2
+        ld1            {v18.s}[0], [x1], x2
+        ld1            {v19.s}[0], [x1], x2
+        ld1            {v20.s}[0], [x1], x2
+        ld1            {v21.s}[0], [x1], x2
+        ld1            {v22.s}[0], [x1], x2
+1:      ld1            {v23.s}[0], [x1], x2   // 8th row completes the window
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.s}[0], [x1], x2   // overwrite oldest row, rotate window
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// Vertical 8-tap qpel filter, 8-bit source, 6 pixels per row.
+// Same 8x-unrolled sliding-window scheme as the v4 variant; rows are
+// loaded 8 bytes wide and 6 results are stored per row (st1 .4h writes
+// 4 halfwords, st1 .s[2] the remaining 2), hence the reduced dst step
+// x9 = MAX_PB_SIZE*2 - 8.
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2      // x1 -= 3*srcstride
+        mov             x9, #(MAX_PB_SIZE * 2 - 8)
+        ld1            {v16.8b}, [x1], x2     // prime 7 rows
+        ld1            {v17.8b}, [x1], x2
+        ld1            {v18.8b}, [x1], x2
+        ld1            {v19.8b}, [x1], x2
+        ld1            {v20.8b}, [x1], x2
+        ld1            {v21.8b}, [x1], x2
+        ld1            {v22.8b}, [x1], x2
+1:      ld1            {v23.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x1], x2     // rotate window: reload oldest row
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// Vertical 8-tap qpel filter, 8-bit source, 8 pixels per row.
+// Same 8x-unrolled sliding-window scheme as the v4 variant, with full
+// 8-byte row loads and one 8h store per output row.
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2      // x1 -= 3*srcstride
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b}, [x1], x2     // prime 7 rows
+        ld1            {v17.8b}, [x1], x2
+        ld1            {v18.8b}, [x1], x2
+        ld1            {v19.8b}, [x1], x2
+        ld1            {v20.8b}, [x1], x2
+        ld1            {v21.8b}, [x1], x2
+        ld1            {v22.8b}, [x1], x2
+1:      ld1            {v23.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x1], x2     // rotate window: reload oldest row
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// Vertical 8-tap qpel filter, 8-bit source, 12 pixels per row.
+// Same 8x-unrolled sliding-window scheme as the v4 variant; rows are
+// loaded 16 bytes wide (4 bytes over-read per row), calc_qpelb handles
+// the low 8 lanes and calc_qpelb2 the high lanes, and 12 results are
+// stored per row (8h + 4h), hence dst step x9 = MAX_PB_SIZE*2 - 16.
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2      // x1 -= 3*srcstride
+        mov             x9, #(MAX_PB_SIZE * 2 - 16)
+        ld1            {v16.16b}, [x1], x2    // prime 7 rows
+        ld1            {v17.16b}, [x1], x2
+        ld1            {v18.16b}, [x1], x2
+        ld1            {v19.16b}, [x1], x2
+        ld1            {v20.16b}, [x1], x2
+        ld1            {v21.16b}, [x1], x2
+        ld1            {v22.16b}, [x1], x2
+1:      ld1            {v23.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x1], x2    // rotate window: reload oldest row
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// Vertical 8-tap qpel filter, 8-bit source, 16 pixels per row.
+// Same 8x-unrolled sliding-window scheme as the v4 variant with 16-byte
+// rows; calc_qpelb/calc_qpelb2 produce the low/high 8 results.
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2      // x1 -= 3*srcstride
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b}, [x1], x2    // prime 7 rows
+        ld1            {v17.16b}, [x1], x2
+        ld1            {v18.16b}, [x1], x2
+        ld1            {v19.16b}, [x1], x2
+        ld1            {v20.16b}, [x1], x2
+        ld1            {v21.16b}, [x1], x2
+        ld1            {v22.16b}, [x1], x2
+1:      ld1            {v23.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x1], x2    // rotate window: reload oldest row
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// TODO: each row is fetched with a pair of 16-byte loads (32 bytes) although
+// only 24 are needed, so this over-reads up to 8 bytes past the row end.
+//
+// Vertical 8-tap qpel filter, 8-bit source, 24 pixels per row.
+// The 8-row window needs 16 vectors (two per row), so the accumulators
+// spill into callee-saved v8-v10, which AAPCS64 requires us to preserve
+// (low 64 bits; saving the full 16 bytes here is safely conservative).
+// Window rows live in v16..v31 as (low16, high8) pairs; the loop is
+// unrolled 8x, rotating the calc_qpelb argument order per step.
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
+        sub             sp, sp, #48
+        st1            {v8.16b, v9.16b, v10.16b}, [sp]    // save callee-saved v8-v10
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2      // x1 -= 3*srcstride
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b, v17.16b}, [x1], x2       // prime 7 rows (32B each, 24 used)
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        ld1            {v22.16b, v23.16b}, [x1], x2
+        ld1            {v24.16b, v25.16b}, [x1], x2
+        ld1            {v26.16b, v27.16b}, [x1], x2
+        ld1            {v28.16b, v29.16b}, [x1], x2
+1:      ld1            {v30.16b, v31.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v16, v18, v20, v22, v24, v26, v28, v30  // cols 0-7
+        calc_qpelb2     v9,  v16, v18, v20, v22, v24, v26, v28, v30  // cols 8-15
+        calc_qpelb      v10, v17, v19, v21, v23, v25, v27, v29, v31  // cols 16-23
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b}, [x1], x2       // rotate window: reload oldest row
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb2     v9,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb      v10, v19, v21, v23, v25, v27, v29, v31, v17
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb2     v9,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb      v10, v21, v23, v25, v27, v29, v31, v17, v19
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb2     v9,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb      v10, v23, v25, v27, v29, v31, v17, v19, v21
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.16b, v23.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb2     v9,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb      v10, v25, v27, v29, v31, v17, v19, v21, v23
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v24.16b, v25.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb2     v9,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb      v10, v27, v29, v31, v17, v19, v21, v23, v25
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v26.16b, v27.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb2     v9,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb      v10, v29, v31, v17, v19, v21, v23, v25, v27
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v28.16b, v29.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb2     v9,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb      v10, v31, v17, v19, v21, v23, v25, v27, v29
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ld1            {v8.16b, v9.16b, v10.16b}, [sp], #48   // restore v8-v10
+        ret
+endfunc
+
+// Vertical 8-tap qpel filter, 8-bit source, 32 pixels per row.
+// Like the v24 variant but all 16 bytes of each half-row are used and a
+// fourth accumulator (v11) handles columns 24-31; callee-saved v8-v11
+// are spilled to the stack around the loop.
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
+        sub             sp, sp, #64
+        st1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]   // save callee-saved v8-v11
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2      // x1 -= 3*srcstride
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b, v17.16b}, [x1], x2       // prime 7 rows
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        ld1            {v22.16b, v23.16b}, [x1], x2
+        ld1            {v24.16b, v25.16b}, [x1], x2
+        ld1            {v26.16b, v27.16b}, [x1], x2
+        ld1            {v28.16b, v29.16b}, [x1], x2
+1:      ld1            {v30.16b, v31.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v16, v18, v20, v22, v24, v26, v28, v30  // cols 0-7
+        calc_qpelb2     v9,  v16, v18, v20, v22, v24, v26, v28, v30  // cols 8-15
+        calc_qpelb      v10, v17, v19, v21, v23, v25, v27, v29, v31  // cols 16-23
+        calc_qpelb2     v11, v17, v19, v21, v23, v25, v27, v29, v31  // cols 24-31
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b}, [x1], x2       // rotate window: reload oldest row
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb2     v9,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb      v10, v19, v21, v23, v25, v27, v29, v31, v17
+        calc_qpelb2     v11, v19, v21, v23, v25, v27, v29, v31, v17
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb2     v9,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb      v10, v21, v23, v25, v27, v29, v31, v17, v19
+        calc_qpelb2     v11, v21, v23, v25, v27, v29, v31, v17, v19
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb2     v9,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb      v10, v23, v25, v27, v29, v31, v17, v19, v21
+        calc_qpelb2     v11, v23, v25, v27, v29, v31, v17, v19, v21
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.16b, v23.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb2     v9,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb      v10, v25, v27, v29, v31, v17, v19, v21, v23
+        calc_qpelb2     v11, v25, v27, v29, v31, v17, v19, v21, v23
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v24.16b, v25.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb2     v9,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb      v10, v27, v29, v31, v17, v19, v21, v23, v25
+        calc_qpelb2     v11, v27, v29, v31, v17, v19, v21, v23, v25
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v26.16b, v27.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb2     v9,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb      v10, v29, v31, v17, v19, v21, v23, v25, v27
+        calc_qpelb2     v11, v29, v31, v17, v19, v21, v23, v25, v27
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v28.16b, v29.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb2     v9,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb      v10, v31, v17, v19, v21, v23, v25, v27, v29
+        calc_qpelb2     v11, v31, v17, v19, v21, v23, v25, v27, v29
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ld1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], #64  // restore v8-v11
+        ret
+endfunc
+
+// Vertical 8-tap qpel filter, 48 pixels per row: two 24-wide passes.
+// Saves x5 (filter index), lr and the x0-x3 arguments, runs v24 on the
+// left half, restores the arguments and runs v24 again offset by 24
+// columns (dst += 48 bytes = 24 int16 samples, src += 24 bytes).
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
+        stp             x5, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        ldr             x5, [sp]        // peek x5; its slot is popped with lr below
+        add             x0, x0, #48     // right half: 24 int16 columns into dst
+        add             x1, x1, #24     // 24 source columns
+        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
+        ldp             xzr, x30, [sp], #16    // discard saved x5, restore lr
+        ret
+endfunc
+
+// QPEL vertical interpolation, width 64, 8-bit.
+// Processes the block in 32-pixel-wide stripes (outer loop decrements x6 =
+// width by 32).  Each stripe runs an 8-tap vertical filter over 32 bytes per
+// row with an 8x software-pipelined inner loop: instead of shifting data, the
+// roles of the row registers v16-v31 rotate one step per unrolled stage, and
+// only the newest row is loaded.  Outputs are int16 rows of MAX_PB_SIZE*2
+// bytes.  v8-v11 hold accumulators; their low halves are callee-saved under
+// AAPCS64, hence the full save/restore around the body.
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
+        sub             sp, sp, #64
+        st1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]   // save callee-saved v8-v11
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1      // x1 -= 2*srcstride
+        sub             x1, x1, x2              // x1 -= srcstride (start 3 rows above)
+        mov             x9, #(MAX_PB_SIZE * 2)  // dst row stride in bytes
+1:      mov             x11, x3         // height
+        mov             x10, x0         // dst
+        mov             x8, x1          // src
+
+        // Prime the pipeline: 7 rows of 32 bytes in v16/v17 .. v28/v29.
+        ld1            {v16.16b, v17.16b}, [x8], x2
+        ld1            {v18.16b, v19.16b}, [x8], x2
+        ld1            {v20.16b, v21.16b}, [x8], x2
+        ld1            {v22.16b, v23.16b}, [x8], x2
+        ld1            {v24.16b, v25.16b}, [x8], x2
+        ld1            {v26.16b, v27.16b}, [x8], x2
+        ld1            {v28.16b, v29.16b}, [x8], x2
+2:      ld1            {v30.16b, v31.16b}, [x8], x2   // stage 1 of 8: newest row
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v16, v18, v20, v22, v24, v26, v28, v30    // bytes 0-7
+        calc_qpelb2     v9,  v16, v18, v20, v22, v24, v26, v28, v30    // bytes 8-15
+        calc_qpelb      v10, v17, v19, v21, v23, v25, v27, v29, v31    // bytes 16-23
+        calc_qpelb2     v11, v17, v19, v21, v23, v25, v27, v29, v31    // bytes 24-31
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        // Stages 2-8 below are identical except the row registers rotate:
+        // the oldest row's registers are reused for the incoming load.
+        ld1            {v16.16b, v17.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb2     v9,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb      v10, v19, v21, v23, v25, v27, v29, v31, v17
+        calc_qpelb2     v11, v19, v21, v23, v25, v27, v29, v31, v17
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.16b, v19.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb2     v9,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb      v10, v21, v23, v25, v27, v29, v31, v17, v19
+        calc_qpelb2     v11, v21, v23, v25, v27, v29, v31, v17, v19
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.16b, v21.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb2     v9,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb      v10, v23, v25, v27, v29, v31, v17, v19, v21
+        calc_qpelb2     v11, v23, v25, v27, v29, v31, v17, v19, v21
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.16b, v23.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb2     v9,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb      v10, v25, v27, v29, v31, v17, v19, v21, v23
+        calc_qpelb2     v11, v25, v27, v29, v31, v17, v19, v21, v23
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v24.16b, v25.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb2     v9,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb      v10, v27, v29, v31, v17, v19, v21, v23, v25
+        calc_qpelb2     v11, v27, v29, v31, v17, v19, v21, v23, v25
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v26.16b, v27.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb2     v9,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb      v10, v29, v31, v17, v19, v21, v23, v25, v27
+        calc_qpelb2     v11, v29, v31, v17, v19, v21, v23, v25, v27
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v28.16b, v29.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb2     v9,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb      v10, v31, v17, v19, v21, v23, v25, v27, v29
+        calc_qpelb2     v11, v31, v17, v19, v21, v23, v25, v27, v29
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.hi            2b                      // rotation is back in phase; repeat
+
+3:      add             x0, x0, #64             // dst += 32 int16 samples
+        add             x1, x1, #32             // src += 32 bytes
+        subs            x6, x6, #32             // next 32-wide stripe
+        b.hi            1b
+        ld1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], #64  // restore v8-v11
+        ret
+endfunc
+
+// QPEL horizontal+vertical interpolation, width 4, 8-bit.
+// Pass 1: the h4 kernel fills a stack temp array with height+7 rows of
+// horizontally filtered int16 samples (row stride 128 bytes).
+// Pass 2: an 8-tap vertical filter over that array, software-pipelined 8x,
+// rotating v16-v23.  The post-indexed loads from sp consume -- and thereby
+// deallocate -- the temp array: 7 priming rows + height loop rows =
+// height+7 rows of 128 bytes each, so sp is fully restored at ret.
+function ff_hevc_put_hevc_qpel_hv4_8_neon, export=1
+        add             x10, x3, #7             // x10 = height + 7
+        lsl             x10, x10, #7            // ... * 128 bytes per tmp row
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!     // save dst, height
+        stp             x5, x30, [sp, #-16]!    // save filter index, LR
+        add             x0, sp, #32             // x0 = tmp array (above saved pairs)
+        sub             x1, x1, x2, lsl #1      // x1 -= 2*srcstride
+        sub             x1, x1, x2              // x1 -= srcstride (start 3 rows above)
+        add             x3, x3, #7              // pass 1 emits height+7 rows
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon)
+        ldp             x5, x30, [sp], #16      // restore filter index, LR
+        ldp             x0, x3, [sp], #16       // restore dst, height
+        load_qpel_filterh x5, x4
+        mov             x7, #128                // tmp/dst row stride in bytes
+        // Prime the vertical pipeline with 7 rows from the tmp array.
+        ld1            {v16.4h}, [sp], x7
+        ld1            {v17.4h}, [sp], x7
+        ld1            {v18.4h}, [sp], x7
+        ld1            {v19.4h}, [sp], x7
+        ld1            {v20.4h}, [sp], x7
+        ld1            {v21.4h}, [sp], x7
+        ld1            {v22.4h}, [sp], x7
+1:      ld1            {v23.4h}, [sp], x7      // 8 stages; row regs rotate each stage
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.4h}, [sp], x7
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x7
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x7
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.4h}, [sp], x7
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.4h}, [sp], x7
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.4h}, [sp], x7
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.4h}, [sp], x7
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// QPEL horizontal+vertical interpolation, width 6, 8-bit.
+// Pass 1: the h6 kernel fills a stack temp array with height+7 rows of
+// horizontally filtered int16 samples (row stride 128 bytes).
+// Pass 2: 8-tap vertical filter, software-pipelined 8x, rotating v16-v23.
+// Each output row is 6 samples, stored as 4h (8 bytes) plus one 32-bit lane
+// (2 samples); x8 = 120 = 128 - 8 completes the 128-byte dst row stride.
+// The post-indexed loads from sp consume exactly height+7 rows of 128 bytes,
+// restoring sp by the time we ret.
+function ff_hevc_put_hevc_qpel_hv6_8_neon, export=1
+        add             x10, x3, #7             // x10 = height + 7
+        lsl             x10, x10, #7            // ... * 128 bytes per tmp row
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!     // save dst, height
+        stp             x5, x30, [sp, #-16]!    // save filter index, LR
+        add             x0, sp, #32             // x0 = tmp array (above saved pairs)
+        sub             x1, x1, x2, lsl #1      // x1 -= 2*srcstride
+        sub             x1, x1, x2              // x1 -= srcstride (start 3 rows above)
+        add             x3, x3, #7              // pass 1 emits height+7 rows
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon)
+        ldp             x5, x30, [sp], #16      // restore filter index, LR
+        ldp             x0, x3, [sp], #16       // restore dst, height
+        load_qpel_filterh x5, x4
+        mov             x7, #128                // tmp row stride in bytes
+        mov             x8, #120                // dst stride remainder after 8-byte store
+        // Prime the vertical pipeline with 7 rows from the tmp array.
+        ld1            {v16.8h}, [sp], x7
+        ld1            {v17.8h}, [sp], x7
+        ld1            {v18.8h}, [sp], x7
+        ld1            {v19.8h}, [sp], x7
+        ld1            {v20.8h}, [sp], x7
+        ld1            {v21.8h}, [sp], x7
+        ld1            {v22.8h}, [sp], x7
+1:      ld1            {v23.8h}, [sp], x7      // 8 stages; row regs rotate each stage
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn
+        calc_qpelh2     v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn2
+        st1            {v1.4h}, [x0], #8       // samples 0-3
+        st1            {v1.s}[2], [x0], x8     // samples 4-5; advance to next row
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x7
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn
+        calc_qpelh2     v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x7
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn
+        calc_qpelh2     v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x7
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn
+        calc_qpelh2     v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn
+        calc_qpelh2     v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x7
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn
+        calc_qpelh2     v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x7
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn
+        calc_qpelh2     v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x7
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn
+        calc_qpelh2     v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// QPEL horizontal+vertical interpolation, width 8, 8-bit.
+// Pass 1: the h8 kernel fills a stack temp array with height+7 rows of
+// horizontally filtered int16 samples (row stride 128 bytes).
+// Pass 2: 8-tap vertical filter, software-pipelined 8x, rotating v16-v23;
+// each stage emits one full 8h row.  The post-indexed loads from sp consume
+// exactly height+7 rows of 128 bytes, restoring sp by the time we ret.
+function ff_hevc_put_hevc_qpel_hv8_8_neon, export=1
+        add             x10, x3, #7             // x10 = height + 7
+        lsl             x10, x10, #7            // ... * 128 bytes per tmp row
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!     // save dst, height
+        stp             x5, x30, [sp, #-16]!    // save filter index, LR
+        add             x0, sp, #32             // x0 = tmp array (above saved pairs)
+        sub             x1, x1, x2, lsl #1      // x1 -= 2*srcstride
+        sub             x1, x1, x2              // x1 -= srcstride (start 3 rows above)
+        add             x3, x3, #7              // pass 1 emits height+7 rows
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon)
+        ldp             x5, x30, [sp], #16      // restore filter index, LR
+        ldp             x0, x3, [sp], #16       // restore dst, height
+        load_qpel_filterh x5, x4
+        mov             x7, #128                // tmp/dst row stride in bytes
+        // Prime the vertical pipeline with 7 rows from the tmp array.
+        ld1            {v16.8h}, [sp], x7
+        ld1            {v17.8h}, [sp], x7
+        ld1            {v18.8h}, [sp], x7
+        ld1            {v19.8h}, [sp], x7
+        ld1            {v20.8h}, [sp], x7
+        ld1            {v21.8h}, [sp], x7
+        ld1            {v22.8h}, [sp], x7
+1:      ld1            {v23.8h}, [sp], x7      // 8 stages; row regs rotate each stage
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn
+        calc_qpelh2     v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x7
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn
+        calc_qpelh2     v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x7
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn
+        calc_qpelh2     v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x7
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn
+        calc_qpelh2     v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn
+        calc_qpelh2     v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x7
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn
+        calc_qpelh2     v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x7
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn
+        calc_qpelh2     v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x7
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn
+        calc_qpelh2     v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// QPEL horizontal+vertical interpolation, width 12, 8-bit.
+// Pass 1: the h12 kernel fills a stack temp array with height+7 rows of
+// horizontally filtered int16 samples (row stride 128 bytes).
+// Pass 2: 8-tap vertical filter over register pairs (12 samples = 8h + 4h),
+// software-pipelined 8x, rotating v16-v31.  Each output row stores 16 bytes
+// then 8 bytes; x8 = 112 = 128 - 16 completes the 128-byte dst row stride.
+// The post-indexed loads from sp consume exactly height+7 rows of 128 bytes,
+// restoring sp by the time we ret.
+function ff_hevc_put_hevc_qpel_hv12_8_neon, export=1
+        add             x10, x3, #7             // x10 = height + 7
+        lsl             x10, x10, #7            // ... * 128 bytes per tmp row
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!     // save dst, height
+        stp             x5, x30, [sp, #-16]!    // save filter index, LR
+        add             x0, sp, #32             // x0 = tmp array (above saved pairs)
+        sub             x1, x1, x2, lsl #1      // x1 -= 2*srcstride
+        sub             x1, x1, x2              // x1 -= srcstride (start 3 rows above)
+        add             x3, x3, #7              // pass 1 emits height+7 rows
+        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon)
+        ldp             x5, x30, [sp], #16      // restore filter index, LR
+        ldp             x0, x3, [sp], #16       // restore dst, height
+        load_qpel_filterh x5, x4
+        mov             x7, #128                // tmp row stride in bytes
+        mov             x8, #112                // dst stride remainder after 16-byte store
+        // Prime the vertical pipeline with 7 rows (two regs per row).
+        ld1            {v16.8h, v17.8h}, [sp], x7
+        ld1            {v18.8h, v19.8h}, [sp], x7
+        ld1            {v20.8h, v21.8h}, [sp], x7
+        ld1            {v22.8h, v23.8h}, [sp], x7
+        ld1            {v24.8h, v25.8h}, [sp], x7
+        ld1            {v26.8h, v27.8h}, [sp], x7
+        ld1            {v28.8h, v29.8h}, [sp], x7
+1:      ld1            {v30.8h, v31.8h}, [sp], x7      // 8 stages; row regs rotate
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn
+        calc_qpelh2     v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn2
+        calc_qpelh      v2, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn
+        st1            {v1.8h}, [x0], #16      // samples 0-7
+        st1            {v2.4h}, [x0], x8       // samples 8-11; advance to next row
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x7
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn
+        calc_qpelh2     v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn2
+        calc_qpelh      v2, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn
+        calc_qpelh2     v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn2
+        calc_qpelh      v2, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x7
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn
+        calc_qpelh2     v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn2
+        calc_qpelh      v2, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h}, [sp], x7
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn
+        calc_qpelh2     v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn2
+        calc_qpelh      v2, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v24.8h, v25.8h}, [sp], x7
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn
+        calc_qpelh2     v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn2
+        calc_qpelh      v2, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v26.8h, v27.8h}, [sp], x7
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn
+        calc_qpelh2     v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn2
+        calc_qpelh      v2, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v28.8h, v29.8h}, [sp], x7
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn
+        calc_qpelh2     v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn2
+        calc_qpelh      v2, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// QPEL horizontal+vertical interpolation, width 16, 8-bit.
+// Pass 1: the h16 kernel fills a stack temp array with height+7 rows of
+// horizontally filtered int16 samples (row stride 128 bytes).
+// Pass 2: 8-tap vertical filter over register pairs (16 samples per row),
+// software-pipelined 8x, rotating v16-v31.  The post-indexed loads from sp
+// consume exactly height+7 rows of 128 bytes, restoring sp by the ret.
+function ff_hevc_put_hevc_qpel_hv16_8_neon, export=1
+        add             x10, x3, #7             // x10 = height + 7
+        lsl             x10, x10, #7            // ... * 128 bytes per tmp row
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!     // save dst, height
+        stp             x5, x30, [sp, #-16]!    // save filter index, LR
+        add             x0, sp, #32             // x0 = tmp array (above saved pairs)
+        sub             x1, x1, x2, lsl #1      // x1 -= 2*srcstride
+        sub             x1, x1, x2              // x1 -= srcstride (start 3 rows above)
+        add             x3, x3, #7              // pass 1 emits height+7 rows
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon)
+        ldp             x5, x30, [sp], #16      // restore filter index, LR
+        ldp             x0, x3, [sp], #16       // restore dst, height
+        load_qpel_filterh x5, x4
+        mov             x7, #128                // tmp/dst row stride in bytes
+        // Prime the vertical pipeline with 7 rows (two regs per row).
+        ld1            {v16.8h, v17.8h}, [sp], x7
+        ld1            {v18.8h, v19.8h}, [sp], x7
+        ld1            {v20.8h, v21.8h}, [sp], x7
+        ld1            {v22.8h, v23.8h}, [sp], x7
+        ld1            {v24.8h, v25.8h}, [sp], x7
+        ld1            {v26.8h, v27.8h}, [sp], x7
+        ld1            {v28.8h, v29.8h}, [sp], x7
+1:      ld1            {v30.8h, v31.8h}, [sp], x7      // 8 stages; row regs rotate
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn
+        calc_qpelh2     v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn2
+        calc_qpelh      v2, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn
+        calc_qpelh2     v2, v3, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x7
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn
+        calc_qpelh2     v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn2
+        calc_qpelh      v2, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn
+        calc_qpelh2     v2, v3, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn
+        calc_qpelh2     v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn2
+        calc_qpelh      v2, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn
+        calc_qpelh2     v2, v3, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x7
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn
+        calc_qpelh2     v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn2
+        calc_qpelh      v2, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn
+        calc_qpelh2     v2, v3, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h}, [sp], x7
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn
+        calc_qpelh2     v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn2
+        calc_qpelh      v2, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn
+        calc_qpelh2     v2, v3, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v24.8h, v25.8h}, [sp], x7
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn
+        calc_qpelh2     v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn2
+        calc_qpelh      v2, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn
+        calc_qpelh2     v2, v3, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v26.8h, v27.8h}, [sp], x7
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn
+        calc_qpelh2     v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn2
+        calc_qpelh      v2, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn
+        calc_qpelh2     v2, v3, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v28.8h, v29.8h}, [sp], x7
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn
+        calc_qpelh2     v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn2
+        calc_qpelh      v2, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn
+        calc_qpelh2     v2, v3, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// QPEL horizontal+vertical interpolation, width 24, 8-bit.
+// Pass 1: the h24 kernel fills a stack temp array with height+7 rows of
+// horizontally filtered int16 samples (row stride 128 bytes).
+// Pass 2: 8-tap vertical filter with three 8h registers per row (24 samples),
+// software-pipelined 8x and rotating v8-v31.  Because the pipeline needs 24
+// row registers, it spills into v8-v15, whose low 64 bits are callee-saved
+// under AAPCS64 -- hence the full q8-q15 save/restore around the function.
+// The post-indexed loads from sp consume exactly height+7 rows of 128 bytes,
+// restoring sp to the saved-register area before the final reloads.
+function ff_hevc_put_hevc_qpel_hv24_8_neon, export=1
+        sub             sp, sp, #64
+        st1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]   // save v8-v11
+        sub             sp, sp, #64
+        st1            {v12.16b, v13.16b, v14.16b, v15.16b}, [sp] // save v12-v15
+        add             x10, x3, #7             // x10 = height + 7
+        lsl             x10, x10, #7            // ... * 128 bytes per tmp row
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!     // save dst, height
+        stp             x5, x30, [sp, #-16]!    // save filter index, LR
+        add             x0, sp, #32             // x0 = tmp array (above saved pairs)
+        sub             x1, x1, x2, lsl #1      // x1 -= 2*srcstride
+        sub             x1, x1, x2              // x1 -= srcstride (start 3 rows above)
+        add             x3, x3, #7              // pass 1 emits height+7 rows
+        bl              X(ff_hevc_put_hevc_qpel_h24_8_neon)
+        ldp             x5, x30, [sp], #16      // restore filter index, LR
+        ldp             x0, x3, [sp], #16       // restore dst, height
+        load_qpel_filterh x5, x4
+        mov             x7, #128                // tmp/dst row stride in bytes
+        // Prime the vertical pipeline with 7 rows (three regs per row).
+        ld1            {v8.8h, v9.8h, v10.8h}, [sp], x7
+        ld1            {v11.8h, v12.8h, v13.8h}, [sp], x7
+        ld1            {v14.8h, v15.8h, v16.8h}, [sp], x7
+        ld1            {v17.8h, v18.8h, v19.8h}, [sp], x7
+        ld1            {v20.8h, v21.8h, v22.8h}, [sp], x7
+        ld1            {v23.8h, v24.8h, v25.8h}, [sp], x7
+        ld1            {v26.8h, v27.8h, v28.8h}, [sp], x7
+1:      ld1            {v29.8h, v30.8h, v31.8h}, [sp], x7  // 8 stages; row regs rotate
+        calc_qpelh      v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
+        calc_qpelh2     v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn2
+        calc_qpelh      v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
+        calc_qpelh2     v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn2
+        calc_qpelh      v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
+        calc_qpelh2     v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v8.8h, v9.8h, v10.8h}, [sp], x7
+        calc_qpelh      v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
+        calc_qpelh2     v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn2
+        calc_qpelh      v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
+        calc_qpelh2     v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn2
+        calc_qpelh      v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
+        calc_qpelh2     v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v11.8h, v12.8h, v13.8h}, [sp], x7
+        calc_qpelh      v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
+        calc_qpelh2     v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn2
+        calc_qpelh      v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
+        calc_qpelh2     v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn2
+        calc_qpelh      v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
+        calc_qpelh2     v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v14.8h, v15.8h, v16.8h}, [sp], x7
+        calc_qpelh      v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
+        calc_qpelh2     v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn2
+        calc_qpelh      v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
+        calc_qpelh2     v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn2
+        calc_qpelh      v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
+        calc_qpelh2     v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8h, v18.8h, v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
+        calc_qpelh2     v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn2
+        calc_qpelh      v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
+        calc_qpelh2     v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn2
+        calc_qpelh      v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
+        calc_qpelh2     v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h, v22.8h}, [sp], x7
+        calc_qpelh      v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
+        calc_qpelh2     v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn2
+        calc_qpelh      v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
+        calc_qpelh2     v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn2
+        calc_qpelh      v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
+        calc_qpelh2     v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v23.8h, v24.8h, v25.8h}, [sp], x7
+        calc_qpelh      v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
+        calc_qpelh2     v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn2
+        calc_qpelh      v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
+        calc_qpelh2     v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn2
+        calc_qpelh      v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
+        calc_qpelh2     v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v26.8h, v27.8h, v28.8h}, [sp], x7
+        calc_qpelh      v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
+        calc_qpelh2     v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn2
+        calc_qpelh      v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
+        calc_qpelh2     v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn2
+        calc_qpelh      v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
+        calc_qpelh2     v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ld1            {v12.16b, v13.16b, v14.16b, v15.16b}, [sp], #64    // restore v12-v15
+        ld1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], #64      // restore v8-v11
+        ret
+endfunc
+
+// 8-tap horizontal + 8-tap vertical (hv) qpel filter, width 32, 8-bit
+// source, int16_t destination, done as a horizontal pass into a stack
+// buffer followed by a vertical pass over that buffer.
+// NOTE(review): register roles assumed from FFmpeg's put_hevc_qpel
+// prototype — x0=dst (int16_t*), x1=src, x2=srcstride, x3=height,
+// x4=mx, x5=my, x6=width — confirm against the C counterpart.
+function ff_hevc_put_hevc_qpel_hv32_8_neon, export=1
+        // Reserve (height + 7) rows of 128 bytes each on the stack for
+        // the horizontally filtered intermediate; the 7 extra rows feed
+        // the 8-tap vertical window.
+        add             x10, x3, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        // Horizontal pass writes to tmp_array (skip the two saved pairs).
+        add             x0, sp, #32
+        // Start 3 rows above the block so the first output row sees the
+        // full 8-tap vertical window, and filter height + 7 rows.
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        add             x3, x3, #7
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        // load_qpel_filterh presumably loads the my-selected vertical
+        // taps (macro defined earlier in this file) — verify there.
+        load_qpel_filterh x5, x4
+        // x7 = row stride of tmp_array and dst, 128 bytes
+        // (presumably MAX_PB_SIZE int16_t — TODO confirm).
+        mov             x7, #128
+        // Outer loop: one 16-column strip (32 bytes of int16) per pass.
+1:      mov             x9, x3          // height
+        mov             x5, x0          // dst
+        mov             x8, sp          // src
+
+        // Prime 7 rows of the 8-row vertical window; the loop below is
+        // software-pipelined 8-deep, rotating which registers hold the
+        // oldest row instead of moving data between registers.
+        ld1            {v16.8h, v17.8h}, [x8], x7
+        ld1            {v18.8h, v19.8h}, [x8], x7
+        ld1            {v20.8h, v21.8h}, [x8], x7
+        ld1            {v22.8h, v23.8h}, [x8], x7
+        ld1            {v24.8h, v25.8h}, [x8], x7
+        ld1            {v26.8h, v27.8h}, [x8], x7
+        ld1            {v28.8h, v29.8h}, [x8], x7
+2:      ld1            {v30.8h, v31.8h}, [x8], x7
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn
+        calc_qpelh2     v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn2
+        calc_qpelh      v2, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn
+        calc_qpelh2     v2, v3, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v16.8h, v17.8h}, [x8], x7
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn
+        calc_qpelh2     v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn2
+        calc_qpelh      v2, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn
+        calc_qpelh2     v2, v3, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v18.8h, v19.8h}, [x8], x7
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn
+        calc_qpelh2     v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn2
+        calc_qpelh      v2, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn
+        calc_qpelh2     v2, v3, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v20.8h, v21.8h}, [x8], x7
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn
+        calc_qpelh2     v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn2
+        calc_qpelh      v2, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn
+        calc_qpelh2     v2, v3, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v22.8h, v23.8h}, [x8], x7
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn
+        calc_qpelh2     v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn2
+        calc_qpelh      v2, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn
+        calc_qpelh2     v2, v3, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v24.8h, v25.8h}, [x8], x7
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn
+        calc_qpelh2     v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn2
+        calc_qpelh      v2, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn
+        calc_qpelh2     v2, v3, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v26.8h, v27.8h}, [x8], x7
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn
+        calc_qpelh2     v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn2
+        calc_qpelh      v2, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn
+        calc_qpelh2     v2, v3, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v28.8h, v29.8h}, [x8], x7
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn
+        calc_qpelh2     v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn2
+        calc_qpelh      v2, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn
+        calc_qpelh2     v2, v3, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.hi            2b
+
+        // Next 16-column strip: advance dst and the tmp_array read base
+        // (sp itself) by 32 bytes.
+3:      add             x0, x0, #32
+        add             sp, sp, #32
+        subs            x6, x6, #16
+        b.hi            1b
+
+        // The strip loop consumed 32 bytes of the first tmp row per
+        // iteration (2 * 32 for width 32); release the remainder of that
+        // row plus the other (height + 6) rows so sp is back to its
+        // entry value.
+        add             sp, sp, #64          // discard rest of first line
+        add             x10, x3, #6
+        lsl             x10, x10, #7
+        add             sp, sp, x10         // tmp_array without first line
+        ret
+endfunc
+
+// hv qpel, width 48: processed as two independent 24-wide halves by
+// calling the hv24 kernel twice. The argument registers are saved
+// around the first call; the xzr slot exists only to keep the stack
+// 16-byte aligned while saving lr.
+function ff_hevc_put_hevc_qpel_hv48_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        // Second half: dst += 24 int16_t columns (48 bytes),
+        // src += 24 pixels.
+        add             x0, x0, #48
+        add             x1, x1, #24
+        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+// hv qpel, width 64: two independent 32-wide halves via the hv32
+// kernel, which takes its width in x6 (set to 32 before each call).
+// xzr in the first pair only pads the lr save to 16-byte alignment.
+function ff_hevc_put_hevc_qpel_hv64_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        mov             x6, #32
+        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        // Second half: dst += 32 int16_t columns (64 bytes),
+        // src += 32 pixels.
+        add             x0, x0, #64
+        add             x1, x1, #32
+        mov             x6, #32
+        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+// uni (8-bit in, 8-bit out) horizontal 8-tap qpel, width 4.
+// NOTE(review): register roles assumed from FFmpeg's put_hevc_qpel_uni
+// prototype — x0=dst, x1=dststride, x2=src, x3=srcstride, x4=height,
+// x5=mx — confirm. load_qpel_filterb presumably loads the mx-selected
+// tap magnitudes into v0-v7 (x6 is scratch); the umlal/umlsl pattern
+// below applies the taps' signs.
+function ff_hevc_put_hevc_qpel_uni_h4_8_neon, export=1
+        load_qpel_filterb x5, x6
+        // Back src up 3 pixels so tap 0 lines up with src[-3].
+        sub             x2, x2, #3
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        // 8-tap MAC into v20; after each tap the 8-byte window in v16
+        // slides one pixel: ushr shifts bytes down, then the next source
+        // byte is inserted at the top from v17.
+        umlsl           v20.8h, v16.8b, v0.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        // Round, shift by 6, narrow and clamp to u8; store 4 pixels.
+        sqrshrun        v20.8b, v20.8h, #6
+        st1            {v20.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// uni horizontal 8-tap qpel, width 6. Same sliding-window scheme as
+// the width-4 version; the 6 output pixels are stored as a 4-byte plus
+// a 2-byte store, so x1 (dststride — assumed, see uni_h4 note) is
+// pre-decremented by the 4 bytes the first store advances.
+function ff_hevc_put_hevc_qpel_uni_h6_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+        sub             x1, x1, #4
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        // 8-tap MAC with per-tap window slide (see uni_h4).
+        umlsl           v20.8h, v16.8b, v0.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        // 6 pixels = 4 bytes + 2 bytes.
+        st1            {v20.s}[0], [x0], #4
+        st1            {v20.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// uni horizontal 8-tap qpel, width 8. Identical scheme to the width-4
+// version (see uni_h4 for the register-role assumptions and the
+// sliding-window MAC), storing all 8 result bytes per row.
+function ff_hevc_put_hevc_qpel_uni_h8_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        // 8-tap MAC with per-tap window slide (see uni_h4).
+        umlsl           v20.8h, v16.8b, v0.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        st1            {v20.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// uni horizontal 8-tap qpel, width 12. ld2 deinterleaves the source
+// into even pixels (v16) and odd pixels (v17); v20 accumulates the
+// even output phase and v21 the odd phase, so each window slide covers
+// two taps. The 12+7 = 19 source bytes come from the 16-byte ld2 plus
+// 3 spill bytes loaded into w12. zip1/zip2 re-interleave the 16-bit
+// results before narrowing. Register-role assumptions as in uni_h4.
+function ff_hevc_put_hevc_qpel_uni_h12_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+        sub             x1, x1, #8
+1:      ld2            {v16.8b, v17.8b}, [x2]
+        ldr             w12, [x2, #16]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        // Slide the even-phase window; top byte refilled from w12,
+        // which is then shifted to expose the next spill byte.
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v16.8b, v1.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v20.8h, v16.8b, v2.8b
+        umlal           v20.8h, v17.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlsl           v21.8h, v17.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        // Remaining slides need no refill: the last 4 outputs of this
+        // row only use bytes already in the registers.
+        ushr            v17.2d, v17.2d, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        // Re-interleave even/odd phases back into pixel order.
+        zip1            v16.8h, v20.8h, v21.8h
+        zip2            v17.8h, v20.8h, v21.8h
+        sqrshrun        v20.8b, v16.8h, #6
+        sqrshrun2       v20.16b, v17.8h, #6
+        // 12 pixels = 8 bytes + 4 bytes (x1 pre-adjusted by -8 above).
+        st1            {v20.8b}, [x0], #8
+        st1            {v20.s}[2], [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// uni horizontal 8-tap qpel, width 16. Same even/odd phase split as
+// uni_h12 (ld2 into v16/v17, accumulators v20/v21) but the 16+7 = 23
+// source bytes need 7 spill bytes, loaded as a 64-bit x12 and consumed
+// one byte per slide. st2 re-interleaves the two phases on store.
+// Register-role assumptions as in uni_h4.
+function ff_hevc_put_hevc_qpel_uni_h16_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld2            {v16.8b, v17.8b}, [x2]
+        ldr             x12, [x2, #16]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        // Alternate sliding v16 (even) and v17 (odd), refilling the top
+        // byte from x12 and shifting x12 to expose the next spill byte.
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v16.8b, v1.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v20.8h, v16.8b, v2.8b
+        umlal           v20.8h, v17.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        // st2 interleaves even/odd phases back into pixel order.
+        st2            {v20.8b, v21.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// uni horizontal 8-tap qpel, width 24. Three-phase variant: ld3 splits
+// the source into pixels 0,3,6,... (v16), 1,4,7,... (v17) and
+// 2,5,8,... (v18) with accumulators v20/v21/v22; each slide covers
+// three taps. 24+7 = 31 source bytes: 24 from ld3 plus 7 spill bytes
+// in x12. st3 re-interleaves on store. Register-role assumptions as in
+// uni_h4.
+function ff_hevc_put_hevc_qpel_uni_h24_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld3            {v16.8b, v17.8b, v18.8b}, [x2]
+        ldr             x12, [x2, #24]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        // Rotate through the three phase registers, refilling each top
+        // byte from x12 (shifted one byte per slide).
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v16.8b, v2.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v16.8b, v1.8b
+        umlsl           v22.8h, v17.8b, v2.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v3.8b
+        umlal           v20.8h, v17.8b, v4.8b
+        umlsl           v20.8h, v18.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v3.8b
+        umlal           v21.8h, v18.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v22.8h, v18.8b, v3.8b
+        umlal           v22.8h, v16.8b, v4.8b
+        umlsl           v22.8h, v17.8b, v5.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v18.8b, v7.8b
+        umlal           v22.8h, v18.8b, v6.8b
+        umlsl           v22.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        sqrshrun        v22.8b, v22.8h, #6
+        // st3 interleaves the three phases back into pixel order.
+        st3            {v20.8b, v21.8b, v22.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// uni horizontal 8-tap qpel, width 32. Four-phase variant: ld4 splits
+// the source into strides-of-4 phases (v16-v19) with accumulators
+// v20-v23, so each slide covers four taps. 32+7 = 39 source bytes:
+// 32 from ld4 plus 7 spill bytes in x12. st4 re-interleaves on store.
+// Register-role assumptions as in uni_h4.
+function ff_hevc_put_hevc_qpel_uni_h32_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld4            {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
+        ldr             x12, [x2, #32]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlal           v20.8h, v19.8b, v3.8b
+        // Rotate through the four phase registers, refilling each top
+        // byte from x12 (shifted one byte per slide).
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v19.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v19.8b, v1.8b
+        umlsl           v22.8h, v16.8b, v2.8b
+        umlal           v22.8h, v17.8b, v3.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v23.8h, v19.8b, v0.8b
+        umlal           v23.8h, v16.8b, v1.8b
+        umlsl           v23.8h, v17.8b, v2.8b
+        umlal           v23.8h, v18.8b, v3.8b
+        ushr            v19.2d, v19.2d, #8
+        mov             v19.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        umlal           v20.8h, v18.8b, v6.8b
+        umlsl           v20.8h, v19.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v18.8b, v5.8b
+        umlal           v21.8h, v19.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v22.8h, v18.8b, v4.8b
+        umlsl           v22.8h, v19.8b, v5.8b
+        umlal           v22.8h, v16.8b, v6.8b
+        umlsl           v22.8h, v17.8b, v7.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        umlal           v23.8h, v19.8b, v4.8b
+        umlsl           v23.8h, v16.8b, v5.8b
+        umlal           v23.8h, v17.8b, v6.8b
+        umlsl           v23.8h, v18.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun        v23.8b, v23.8h, #6
+        // st4 interleaves the four phases back into pixel order.
+        st4            {v20.8b, v21.8b, v22.8b, v23.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// uni horizontal 8-tap qpel, width 48. Full-vector (16-byte) version
+// of the three-phase uni_h24 scheme: ld3.16b splits 48 source bytes
+// into three phases; the low halves accumulate into v20-v22 and the
+// high halves (umlal2/umlsl2) into v23-v25. Two spill registers feed
+// the slides: x12 holds bytes at src+24 (tops of the low halves) and
+// x13 bytes at src+48 (tops of the high halves). Register-role
+// assumptions as in uni_h4.
+function ff_hevc_put_hevc_qpel_uni_h48_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld3            {v16.16b, v17.16b, v18.16b}, [x2]
+        ldr             x12, [x2, #24]
+        ldr             x13, [x2, #48]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlsl2          v23.8h, v16.16b, v0.16b
+        umlal2          v23.8h, v17.16b, v1.16b
+        umlsl2          v23.8h, v18.16b, v2.16b
+        // Slide both 64-bit lanes of a phase register at once and
+        // refill byte 7 from x12 and byte 15 from x13.
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v16.8b, v2.8b
+        umlsl2          v24.8h, v17.16b, v0.16b
+        umlal2          v24.8h, v18.16b, v1.16b
+        umlsl2          v24.8h, v16.16b, v2.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        mov             v17.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v16.8b, v1.8b
+        umlsl           v22.8h, v17.8b, v2.8b
+        umlsl2          v25.8h, v18.16b, v0.16b
+        umlal2          v25.8h, v16.16b, v1.16b
+        umlsl2          v25.8h, v17.16b, v2.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        mov             v18.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v20.8h, v16.8b, v3.8b
+        umlal           v20.8h, v17.8b, v4.8b
+        umlsl           v20.8h, v18.8b, v5.8b
+        umlal2          v23.8h, v16.16b, v3.16b
+        umlal2          v23.8h, v17.16b, v4.16b
+        umlsl2          v23.8h, v18.16b, v5.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v21.8h, v17.8b, v3.8b
+        umlal           v21.8h, v18.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        umlal2          v24.8h, v17.16b, v3.16b
+        umlal2          v24.8h, v18.16b, v4.16b
+        umlsl2          v24.8h, v16.16b, v5.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        mov             v17.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v22.8h, v18.8b, v3.8b
+        umlal           v22.8h, v16.8b, v4.8b
+        umlsl           v22.8h, v17.8b, v5.8b
+        umlal2          v25.8h, v18.16b, v3.16b
+        umlal2          v25.8h, v16.16b, v4.16b
+        umlsl2          v25.8h, v17.16b, v5.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        mov             v18.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        umlal2          v23.8h, v16.16b, v6.16b
+        umlsl2          v23.8h, v17.16b, v7.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v18.8b, v7.8b
+        umlal2          v24.8h, v17.16b, v6.16b
+        umlsl2          v24.8h, v18.16b, v7.16b
+        umlal           v22.8h, v18.8b, v6.8b
+        umlsl           v22.8h, v16.8b, v7.8b
+        umlal2          v25.8h, v18.16b, v6.16b
+        umlsl2          v25.8h, v16.16b, v7.16b
+        // Narrow low halves into the low 8 bytes and high halves into
+        // the high 8 bytes, then st3 restores pixel order.
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun2       v20.16b, v23.8h, #6
+        sqrshrun2       v21.16b, v24.8h, #6
+        sqrshrun2       v22.16b, v25.8h, #6
+        st3            {v20.16b, v21.16b, v22.16b}, [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// uni horizontal 8-tap qpel, width 64. Full-vector (16-byte) version
+// of the four-phase uni_h32 scheme: ld4.16b splits 64 source bytes
+// into four phases; low halves accumulate into v20-v23 and high halves
+// (umlal2/umlsl2) into v24-v27. Spill registers: x12 holds bytes at
+// src+32 (tops of the low halves), x13 bytes at src+64 (tops of the
+// high halves). Register-role assumptions as in uni_h4.
+function ff_hevc_put_hevc_qpel_uni_h64_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld4            {v16.16b, v17.16b, v18.16b, v19.16b}, [x2]
+        ldr             x12, [x2, #32]
+        ldr             x13, [x2, #64]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        movi            v26.8h, #0
+        movi            v27.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlal           v20.8h, v19.8b, v3.8b
+        umlsl2          v24.8h, v16.16b, v0.16b
+        umlal2          v24.8h, v17.16b, v1.16b
+        umlsl2          v24.8h, v18.16b, v2.16b
+        umlal2          v24.8h, v19.16b, v3.16b
+        // Slide both 64-bit lanes of a phase register at once; refill
+        // byte 7 from x12 and byte 15 from x13, one spill byte per
+        // slide.
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        mov             v16.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v19.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        umlsl2          v25.8h, v17.16b, v0.16b
+        umlal2          v25.8h, v18.16b, v1.16b
+        umlsl2          v25.8h, v19.16b, v2.16b
+        umlal2          v25.8h, v16.16b, v3.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        mov             v17.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v19.8b, v1.8b
+        umlsl           v22.8h, v16.8b, v2.8b
+        umlal           v22.8h, v17.8b, v3.8b
+        umlsl2          v26.8h, v18.16b, v0.16b
+        umlal2          v26.8h, v19.16b, v1.16b
+        umlsl2          v26.8h, v16.16b, v2.16b
+        umlal2          v26.8h, v17.16b, v3.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        mov             v18.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v23.8h, v19.8b, v0.8b
+        umlal           v23.8h, v16.8b, v1.8b
+        umlsl           v23.8h, v17.8b, v2.8b
+        umlal           v23.8h, v18.8b, v3.8b
+        umlsl2          v27.8h, v19.16b, v0.16b
+        umlal2          v27.8h, v16.16b, v1.16b
+        umlsl2          v27.8h, v17.16b, v2.16b
+        umlal2          v27.8h, v18.16b, v3.16b
+        ushr            v19.2d, v19.2d, #8
+        mov             v19.b[7], w12
+        lsr             x12, x12, #8
+        mov             v19.b[15], w13
+        lsr             x13, x13, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        umlal           v20.8h, v18.8b, v6.8b
+        umlsl           v20.8h, v19.8b, v7.8b
+        umlal2          v24.8h, v16.16b, v4.16b
+        umlsl2          v24.8h, v17.16b, v5.16b
+        umlal2          v24.8h, v18.16b, v6.16b
+        umlsl2          v24.8h, v19.16b, v7.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        mov             v16.b[15], w13
+        lsr             x13, x13, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v18.8b, v5.8b
+        umlal           v21.8h, v19.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        umlal2          v25.8h, v17.16b, v4.16b
+        umlsl2          v25.8h, v18.16b, v5.16b
+        umlal2          v25.8h, v19.16b, v6.16b
+        umlsl2          v25.8h, v16.16b, v7.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        mov             v17.b[15], w13
+        lsr             x13, x13, #8
+        umlal           v22.8h, v18.8b, v4.8b
+        umlsl           v22.8h, v19.8b, v5.8b
+        umlal           v22.8h, v16.8b, v6.8b
+        umlsl           v22.8h, v17.8b, v7.8b
+        umlal2          v26.8h, v18.16b, v4.16b
+        umlsl2          v26.8h, v19.16b, v5.16b
+        umlal2          v26.8h, v16.16b, v6.16b
+        umlsl2          v26.8h, v17.16b, v7.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        mov             v18.b[15], w13
+        umlal           v23.8h, v19.8b, v4.8b
+        umlsl           v23.8h, v16.8b, v5.8b
+        umlal           v23.8h, v17.8b, v6.8b
+        umlsl           v23.8h, v18.8b, v7.8b
+        umlal2          v27.8h, v19.16b, v4.16b
+        umlsl2          v27.8h, v16.16b, v5.16b
+        umlal2          v27.8h, v17.16b, v6.16b
+        umlsl2          v27.8h, v18.16b, v7.16b
+        // Narrow low halves into the low 8 bytes and high halves into
+        // the high 8 bytes, then st4 restores pixel order.
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun        v23.8b, v23.8h, #6
+        sqrshrun2       v20.16b, v24.8h, #6
+        sqrshrun2       v21.16b, v25.8h, #6
+        sqrshrun2       v22.16b, v26.8h, #6
+        sqrshrun2       v23.16b, v27.8h, #6
+        st4            {v20.16b, v21.16b, v22.16b, v23.16b}, [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+// HEVC 8-tap (qpel) vertical filter, unidirectional, width 4, 8-bit.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, x4 = height;
+// x6/x5 select the filter via load_qpel_filterb (presumably my/mx — confirm
+// against the put_hevc_qpel_uni C prototype).
+function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1      // back src up by 3 rows:
+        sub             x2, x2, x3              // 8-tap filter history
+        ld1            {v16.s}[0], [x2], x3     // preload 7 rows (4 px each)
+        ld1            {v17.s}[0], [x2], x3
+        ld1            {v18.s}[0], [x2], x3
+        ld1            {v19.s}[0], [x2], x3
+        ld1            {v20.s}[0], [x2], x3
+        ld1            {v21.s}[0], [x2], x3
+        ld1            {v22.s}[0], [x2], x3
+        // 8x unrolled: v16..v23 hold the 8 most recent rows as a sliding
+        // window; each step reloads only the oldest register, so no data
+        // has to be moved between registers.
+1:      ld1            {v23.s}[0], [x2], x3
+        movi            v24.8h, #0              // clear the accumulator
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        sqrshrun        v24.8b, v24.8h, #6      // round >>6, saturate to u8
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// HEVC 8-tap (qpel) vertical filter, unidirectional, width 6, 8-bit.
+// Same register-rotation scheme as the width-4 variant; each 6-pixel row is
+// stored as a 4-byte chunk plus a 2-byte chunk, hence dst stride is reduced
+// by 4 so the second store's post-increment lands on the next row.
+function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1      // back src up by 3 rows
+        sub             x2, x2, x3
+        sub             x1, x1, #4              // stride compensates 4-byte store
+        ld1            {v16.8b}, [x2], x3       // preload 7 rows
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+        // 8x unrolled sliding window over v16..v23; only the oldest
+        // register is reloaded each row.
+1:      ld1            {v23.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        sqrshrun        v24.8b, v24.8h, #6      // round >>6, saturate to u8
+        st1            {v24.s}[0], [x0], #4     // bytes 0-3
+        st1            {v24.h}[2], [x0], x1     // bytes 4-5, then next row
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// HEVC 8-tap (qpel) vertical filter, unidirectional, width 8, 8-bit.
+// Same sliding-window scheme as the width-4/6 variants; full 8-byte rows,
+// so a single store per output row.
+function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1      // back src up by 3 rows
+        sub             x2, x2, x3
+        ld1            {v16.8b}, [x2], x3       // preload 7 rows
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+        // 8x unrolled sliding window over v16..v23.
+1:      ld1            {v23.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        sqrshrun        v24.8b, v24.8h, #6      // round >>6, saturate to u8
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// HEVC 8-tap (qpel) vertical filter, unidirectional, 12-pixel columns, 8-bit.
+// Outer loop (label 1) walks the width in x7 in 12-column strips; the inner
+// loop (label 2) is the usual 8x-unrolled sliding window. Also used for
+// width 24 via the ff_hevc_put_hevc_qpel_uni_v24 trampoline (two strips).
+// Each 12-px row is stored as 8 bytes + 4 bytes, hence dst stride - 8.
+function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1      // back src up by 3 rows
+        sub             x2, x2, x3
+        sub             x1, x1, #8              // stride compensates 8-byte store
+1:      mov             x11, x4         // height
+        mov             x10, x0         // dst
+        mov             x8, x2          // src
+
+        // NOTE(review): 16-byte loads for 12 used pixels — assumes the source
+        // buffer tolerates a 4-byte over-read (edge-padded frame); confirm.
+        ld1            {v16.16b}, [x8], x3
+        ld1            {v17.16b}, [x8], x3
+        ld1            {v18.16b}, [x8], x3
+        ld1            {v19.16b}, [x8], x3
+        ld1            {v20.16b}, [x8], x3
+        ld1            {v21.16b}, [x8], x3
+        ld1            {v22.16b}, [x8], x3
+2:      ld1            {v23.16b}, [x8], x3
+        movi            v24.8h, #0              // low-half accumulator
+        movi            v25.8h, #0              // high-half accumulator
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        sqrshrun        v24.8b, v24.8h, #6      // round >>6, saturate to u8
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8      // bytes 0-7
+        st1            {v24.s}[2], [x10], x1    // bytes 8-11, then next row
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v16.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v17.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v19.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v21.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.hi            2b
+
+3:      add             x0, x0, #12             // advance to the next 12-column
+        add             x2, x2, #12             // strip
+        subs            x7, x7, #12             // x7 = remaining width
+        b.ne            1b
+        ret
+endfunc
+
+// HEVC 8-tap (qpel) vertical filter, unidirectional, 16-pixel columns, 8-bit.
+// Outer loop (label 1) walks the width in x7 in 16-column strips; also used
+// for widths 32/48/64 via the uni_v32/48/64 trampolines below.
+function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1      // back src up by 3 rows
+        sub             x2, x2, x3
+1:      mov             x11, x4         // height
+        mov             x10, x0         // dst
+        mov             x8, x2          // src
+
+        ld1            {v16.16b}, [x8], x3      // preload 7 rows of 16 px
+        ld1            {v17.16b}, [x8], x3
+        ld1            {v18.16b}, [x8], x3
+        ld1            {v19.16b}, [x8], x3
+        ld1            {v20.16b}, [x8], x3
+        ld1            {v21.16b}, [x8], x3
+        ld1            {v22.16b}, [x8], x3
+2:      ld1            {v23.16b}, [x8], x3
+        movi            v24.8h, #0              // low-half accumulator
+        movi            v25.8h, #0              // high-half accumulator
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        sqrshrun        v24.8b, v24.8h, #6      // round >>6, saturate to u8
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v16.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v17.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v19.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v21.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.hi            2b
+
+3:      add             x0, x0, #16             // advance to the next 16-column
+        add             x2, x2, #16             // strip
+        subs            x7, x7, #16             // x7 = remaining width
+        b.ne            1b
+        ret
+endfunc
+
+// Width 24: tail-call the 12-column routine, whose x7 width loop runs twice.
+function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
+endfunc
+
+// Width 32: tail-call the 16-column routine (x7 width loop runs twice).
+function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+// Width 48: tail-call the 16-column routine (x7 width loop runs three times).
+function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+// Width 64: tail-call the 16-column routine (x7 width loop runs four times).
+function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+// HEVC qpel HV, unidirectional, width 4, 8-bit: horizontal 8-tap pass into a
+// stack temp (16-bit samples, rows of MAX_PB_SIZE*2 = 128 bytes), then a
+// vertical 8-tap pass over the intermediates with one combined rounding
+// shift (#12 = 6 horizontal + 6 vertical bits).
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, x4 = height,
+// x5/x6 = filter selectors (presumably mx/my — confirm vs. C prototype).
+// Fix vs. original: the seven in-loop reloads used ld1 {vN.8h} while the
+// preloads and loop head use .4h; only the low 4 halfwords are ever consumed
+// (the first iteration runs entirely on .4h-loaded registers), so the .8h
+// loads merely read 8 bytes of unused temp data per row. Use .4h throughout.
+function ff_hevc_put_hevc_qpel_uni_hv4_8_neon, export=1
+        add             x10, x4, #7             // (height + 7) rows ...
+        lsl             x10, x10, #7            // ... of 128 bytes each
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!     // save dst/dststride
+        stp             x4, x6, [sp, #-16]!     // save height/filter sel
+        stp             x30, xzr, [sp, #-16]!   // save LR; xzr pads to 16 B
+        add             x0, sp, #48             // helper dst = tmp_array
+        sub             x1, x2, x3, lsl #1      // helper src -= 3 rows
+        sub             x1, x1, x3              // (filter history)
+        mov             x2, x3
+        add             x3, x4, #7              // helper height = height + 7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon)
+        ldp             x30, xzr, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        mov             x9, #(MAX_PB_SIZE * 2)
+        // Vertical pass consumes the temp via post-indexed loads: after
+        // (height + 7) rows * 128 bytes, sp is back at its entry value, so
+        // no explicit deallocation is needed before ret.
+        ld1            {v16.4h}, [sp], x9       // preload 7 temp rows
+        ld1            {v17.4h}, [sp], x9
+        ld1            {v18.4h}, [sp], x9
+        ld1            {v19.4h}, [sp], x9
+        ld1            {v20.4h}, [sp], x9
+        ld1            {v21.4h}, [sp], x9
+        ld1            {v22.4h}, [sp], x9
+1:      ld1            {v23.4h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h            // saturate to u8
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.4h}, [sp], x9       // .4h (was .8h): match preloads
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.4h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.4h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.4h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.4h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// HEVC qpel HV, unidirectional, width 6, 8-bit: horizontal 8-tap helper into
+// a 128-byte-per-row stack temp, then vertical 8-tap with combined #12 shift.
+// Output rows are stored as 4 + 2 bytes (dst stride reduced by 4).
+function ff_hevc_put_hevc_qpel_uni_hv6_8_neon, export=1
+        add             x10, x4, #7             // (height + 7) rows ...
+        lsl             x10, x10, #7            // ... of 128 bytes each
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!     // save dst/dststride
+        stp             x4, x6, [sp, #-16]!     // save height/filter sel
+        stp             x30, xzr, [sp, #-16]!   // save LR; xzr pads to 16 B
+        add             x0, sp, #48             // helper dst = tmp_array
+        sub             x1, x2, x3, lsl #1      // helper src -= 3 rows
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x4, #7              // helper height = height + 7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon)
+        ldp             x30, xzr, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        sub             x1, x1, #4              // stride compensates 4-byte store
+        mov             x9, #(MAX_PB_SIZE * 2)
+        // Post-indexed loads consume the temp; after (height + 7) rows of
+        // 128 bytes sp is back at its entry value — no explicit free needed.
+        ld1            {v16.8h}, [sp], x9       // preload 7 temp rows
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+1:      ld1            {v23.8h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn, #12
+        calc_qpelh2     v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h            // saturate to u8
+        st1            {v1.s}[0], [x0], #4      // bytes 0-3
+        st1            {v1.h}[2], [x0], x1      // bytes 4-5, then next row
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x9
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn, #12
+        calc_qpelh2     v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn, #12
+        calc_qpelh2     v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn, #12
+        calc_qpelh2     v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn, #12
+        calc_qpelh2     v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn, #12
+        calc_qpelh2     v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn, #12
+        calc_qpelh2     v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn, #12
+        calc_qpelh2     v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// HEVC qpel HV, unidirectional, width 8, 8-bit: horizontal 8-tap helper into
+// a 128-byte-per-row stack temp, then vertical 8-tap with combined #12 shift.
+function ff_hevc_put_hevc_qpel_uni_hv8_8_neon, export=1
+        add             x10, x4, #7             // (height + 7) rows ...
+        lsl             x10, x10, #7            // ... of 128 bytes each
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!     // save dst/dststride
+        stp             x4, x6, [sp, #-16]!     // save height/filter sel
+        stp             x30, xzr, [sp, #-16]!   // save LR; xzr pads to 16 B
+        add             x0, sp, #48             // helper dst = tmp_array
+        sub             x1, x2, x3, lsl #1      // helper src -= 3 rows
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x4, #7              // helper height = height + 7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon)
+        ldp             x30, xzr, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        mov             x9, #(MAX_PB_SIZE * 2)
+        // Post-indexed loads consume the temp; after (height + 7) rows of
+        // 128 bytes sp is back at its entry value — no explicit free needed.
+        ld1            {v16.8h}, [sp], x9       // preload 7 temp rows
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+1:      ld1            {v23.8h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn, #12
+        calc_qpelh2     v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h            // saturate to u8
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x9
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn, #12
+        calc_qpelh2     v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn, #12
+        calc_qpelh2     v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn, #12
+        calc_qpelh2     v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn, #12
+        calc_qpelh2     v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn, #12
+        calc_qpelh2     v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn, #12
+        calc_qpelh2     v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn, #12
+        calc_qpelh2     v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv12_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        sub             x1, x1, #8
+        mov             x9, #(MAX_PB_SIZE * 2)
+
+        ld1            {v16.8h, v17.8h}, [sp], x9
+        ld1            {v18.8h, v19.8h}, [sp], x9
+        ld1            {v20.8h, v21.8h}, [sp], x9
+        ld1            {v22.8h, v23.8h}, [sp], x9
+        ld1            {v24.8h, v25.8h}, [sp], x9
+        ld1            {v26.8h, v27.8h}, [sp], x9
+        ld1            {v28.8h, v29.8h}, [sp], x9
+1:      ld1            {v30.8h, v31.8h}, [sp], x9
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn, #12
+        calc_qpelh2     v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn2, #12
+        calc_qpelh      v2, v17, v19, v21, v23, v25, v27, v29, v31, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x9
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn, #12
+        calc_qpelh2     v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn2, #12
+        calc_qpelh      v2, v19, v21, v23, v25, v27, v29, v31, v17, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x9
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn, #12
+        calc_qpelh2     v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn2, #12
+        calc_qpelh      v2, v21, v23, v25, v27, v29, v31, v17, v19, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x9
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn, #12
+        calc_qpelh2     v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn2, #12
+        calc_qpelh      v2, v23, v25, v27, v29, v31, v17, v19, v21, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h}, [sp], x9
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn, #12
+        calc_qpelh2     v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn2, #12
+        calc_qpelh      v2, v25, v27, v29, v31, v17, v19, v21, v23, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v24.8h, v25.8h}, [sp], x9
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn, #12
+        calc_qpelh2     v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn2, #12
+        calc_qpelh      v2, v27, v29, v31, v17, v19, v21, v23, v25, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v26.8h, v27.8h}, [sp], x9
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn, #12
+        calc_qpelh2     v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn2, #12
+        calc_qpelh      v2, v29, v31, v17, v19, v21, v23, v25, v27, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v28.8h, v29.8h}, [sp], x9
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn, #12
+        calc_qpelh2     v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn2, #12
+        calc_qpelh      v2, v31, v17, v19, v21, v23, v25, v27, v29, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+// void ff_hevc_put_hevc_qpel_uni_hv16_8_neon(uint8_t *dst, ptrdiff_t dststride,
+//         uint8_t *src, ptrdiff_t srcstride, int height, intptr_t mx,
+//         intptr_t my, int width)
+// Register roles as used below: x0=dst, x1=dststride, x2=src, x3=srcstride,
+// x4=height, x5=mx, x6=my, x7=width (multiple of 16 here).
+// Two-pass uni HV filter: first the horizontal 8-tap pass writes
+// (height + 7) rows of int16 into a stack temp array (row stride
+// MAX_PB_SIZE * 2 bytes), then the vertical 8-tap pass below reads it back,
+// narrows with rounding (>> 12) and stores unsigned 8-bit pixels.
+// The vertical loop at .Lqpel_uni_hv16_loop is shared by the hv32/48/64
+// variants, which tail-branch into it with their own width in x7.
+function ff_hevc_put_hevc_qpel_uni_hv16_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7        // (height + 7) * MAX_PB_SIZE * 2
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]! // save dst/dststride
+        stp             x4, x6, [sp, #-16]! // save height/my
+        stp             x7, x30, [sp, #-16]!    // save width/LR
+        add             x0, sp, #48         // h-pass dst = tmp_array
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3          // h-pass src = src - 3 * srcstride
+        mov             x2, x3              // h-pass srcstride
+        add             x3, x4, #7          // h-pass height = height + 7
+        mov             x4, x5              // h-pass mx
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+.Lqpel_uni_hv16_loop:
+        load_qpel_filterh x6, x5            // vertical filter from my
+        mov             x9, #(MAX_PB_SIZE * 2)
+        sub             x12, x9, x7, lsl #1 // bytes left in a tmp row after width columns
+        // Outer loop: one 16-column strip per iteration; sp still points at
+        // the current strip's first tmp row.
+1:      mov             x11, x4         // height
+        mov             x10, x0         // dst
+        mov             x8, sp          // src
+
+        // Prime the 8-tap vertical pipeline with 7 rows (16 int16 each),
+        // then run an 8-phase unrolled loop that rotates v16..v31 so each
+        // phase only loads one new row.
+        ld1            {v16.8h, v17.8h}, [x8], x9
+        ld1            {v18.8h, v19.8h}, [x8], x9
+        ld1            {v20.8h, v21.8h}, [x8], x9
+        ld1            {v22.8h, v23.8h}, [x8], x9
+        ld1            {v24.8h, v25.8h}, [x8], x9
+        ld1            {v26.8h, v27.8h}, [x8], x9
+        ld1            {v28.8h, v29.8h}, [x8], x9
+2:      ld1            {v30.8h, v31.8h}, [x8], x9
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn, #12
+        calc_qpelh2     v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn2, #12
+        calc_qpelh      v2, v17, v19, v21, v23, v25, v27, v29, v31, sqrshrn, #12
+        calc_qpelh2     v2, v3, v17, v19, v21, v23, v25, v27, v29, v31, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h        // saturating narrow to u8
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v16.8h, v17.8h}, [x8], x9
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn, #12
+        calc_qpelh2     v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn2, #12
+        calc_qpelh      v2, v19, v21, v23, v25, v27, v29, v31, v17, sqrshrn, #12
+        calc_qpelh2     v2, v3, v19, v21, v23, v25, v27, v29, v31, v17, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.8h, v19.8h}, [x8], x9
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn, #12
+        calc_qpelh2     v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn2, #12
+        calc_qpelh      v2, v21, v23, v25, v27, v29, v31, v17, v19, sqrshrn, #12
+        calc_qpelh2     v2, v3, v21, v23, v25, v27, v29, v31, v17, v19, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.8h, v21.8h}, [x8], x9
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn, #12
+        calc_qpelh2     v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn2, #12
+        calc_qpelh      v2, v23, v25, v27, v29, v31, v17, v19, v21, sqrshrn, #12
+        calc_qpelh2     v2, v3, v23, v25, v27, v29, v31, v17, v19, v21, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.8h, v23.8h}, [x8], x9
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn, #12
+        calc_qpelh2     v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn2, #12
+        calc_qpelh      v2, v25, v27, v29, v31, v17, v19, v21, v23, sqrshrn, #12
+        calc_qpelh2     v2, v3, v25, v27, v29, v31, v17, v19, v21, v23, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v24.8h, v25.8h}, [x8], x9
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn, #12
+        calc_qpelh2     v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn2, #12
+        calc_qpelh      v2, v27, v29, v31, v17, v19, v21, v23, v25, sqrshrn, #12
+        calc_qpelh2     v2, v3, v27, v29, v31, v17, v19, v21, v23, v25, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v26.8h, v27.8h}, [x8], x9
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn, #12
+        calc_qpelh2     v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn2, #12
+        calc_qpelh      v2, v29, v31, v17, v19, v21, v23, v25, v27, sqrshrn, #12
+        calc_qpelh2     v2, v3, v29, v31, v17, v19, v21, v23, v25, v27, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v28.8h, v29.8h}, [x8], x9
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn, #12
+        calc_qpelh2     v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn2, #12
+        calc_qpelh      v2, v31, v17, v19, v21, v23, v25, v27, v29, sqrshrn, #12
+        calc_qpelh2     v2, v3, v31, v17, v19, v21, v23, v25, v27, v29, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.hi            2b
+
+        // Next 16-column strip: advance dst and consume 32 bytes of the
+        // first tmp row (sp doubles as the strip cursor).
+3:      add             x0, x0, #16
+        add             sp, sp, #32
+        subs            x7, x7, #16
+        b.ne            1b
+        add             sp, sp, x12         // discard rest of first line
+        add             x10, x4, #6
+        lsl             x10, x10, #7
+        add             sp, sp, x10         // tmp_array without first line
+        ret
+endfunc
+
+// Width-24 uni HV: composed as a 16-wide call plus an 8-wide call on the
+// right part. All caller-visible argument registers are spilled around the
+// first call because the callees are free to clobber them.
+function ff_hevc_put_hevc_qpel_uni_hv24_8_neon, export=1
+        stp             x6, x30, [sp, #-16]!    // save my/LR
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        mov             x7, #16             // left part: width = 16
+        bl              X(ff_hevc_put_hevc_qpel_uni_hv16_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        ldr             x6, [sp]            // re-read my (slot not popped yet)
+        add             x0, x0, #16         // advance dst to the right part
+        add             x2, x2, #16         // advance src to the right part
+        mov             x7, #8              // right part: width = 8
+        bl              X(ff_hevc_put_hevc_qpel_uni_hv8_8_neon)
+        ldp             xzr, x30, [sp], #16 // discard saved x6, restore LR
+        ret
+endfunc
+
+// Width-32 uni HV: run the h32 horizontal pass into a stack temp array of
+// (height + 7) rows, then reuse the shared vertical loop of the hv16
+// function (.Lqpel_uni_hv16_loop), which iterates 16-column strips using
+// the width in x7. Same prologue pattern as uni_hv16 above.
+function ff_hevc_put_hevc_qpel_uni_hv32_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7        // (height + 7) * MAX_PB_SIZE * 2
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48         // h-pass dst = tmp_array
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3          // h-pass src = src - 3 * srcstride
+        mov             x2, x3
+        add             x3, x4, #7          // h-pass height = height + 7
+        mov             x4, x5              // h-pass mx
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        b .Lqpel_uni_hv16_loop              // shared vertical pass (pops tmp_array)
+endfunc
+
+// Width-48 uni HV: identical to uni_hv32 above except the horizontal pass
+// helper; the vertical pass is the shared loop in uni_hv16.
+function ff_hevc_put_hevc_qpel_uni_hv48_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7        // (height + 7) * MAX_PB_SIZE * 2
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48         // h-pass dst = tmp_array
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3          // h-pass src = src - 3 * srcstride
+        mov             x2, x3
+        add             x3, x4, #7          // h-pass height = height + 7
+        mov             x4, x5              // h-pass mx
+        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        b .Lqpel_uni_hv16_loop              // shared vertical pass (pops tmp_array)
+endfunc
+
+// Width-64 uni HV: identical to uni_hv32 above except the horizontal pass
+// helper; the vertical pass is the shared loop in uni_hv16.
+function ff_hevc_put_hevc_qpel_uni_hv64_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7        // (height + 7) * MAX_PB_SIZE * 2
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48         // h-pass dst = tmp_array
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3          // h-pass src = src - 3 * srcstride
+        mov             x2, x3
+        add             x3, x4, #7          // h-pass height = height + 7
+        mov             x4, x5              // h-pass mx
+        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        b .Lqpel_uni_hv16_loop              // shared vertical pass (pops tmp_array)
+endfunc
+
+// Bidirectional horizontal qpel, width 4.
+// Register roles as used below: x0=dst (u8), x1=dststride, x2=src (u8),
+// x3=srcstride, x4=src2 (int16 plane, rows MAX_PB_SIZE*2 bytes apart),
+// x5=height, x6=mx (filter index for load_qpel_filterb). x7 is passed to
+// load_qpel_filterb — presumably as a scratch register; the macro is
+// defined elsewhere, confirm there. The 8-tap filter is applied with
+// umlal/umlsl, i.e. tap signs are baked into the add/subtract choice
+// (taps presumably loaded as magnitudes into v0..v7 by the macro).
+function ff_hevc_put_hevc_qpel_bi_h4_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3          // back up 3 pixels for the 8-tap window
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b, v17.8b}, [x2], x3  // v16 = src[0..7], v17 = src[8..15]
+        movi            v20.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        // Each ushr + insert slides the 8-byte window one pixel to the
+        // right, pulling the next source byte in from v17.
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        ld1            {v24.8h}, [x4], x10  // one row of the int16 src2 plane
+        sqadd           v16.8h, v20.8h, v24.8h  // qpel + src2, saturating
+        sqrshrun        v16.8b, v16.8h, #7  // rounding >>7, clip to u8
+        st1            {v16.s}[0], [x0], x1 // store 4 pixels
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Bidirectional horizontal qpel, width 6. Same register roles and
+// sliding-window scheme as bi_h4 above; the 6-pixel result is stored as
+// 4 bytes + 2 bytes, so x1 is pre-decremented by the 4 already advanced.
+function ff_hevc_put_hevc_qpel_bi_h6_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3          // back up 3 pixels for the 8-tap window
+        sub             x1, x1, #4          // second store below advances by 4 first
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        ushr            v16.2d, v16.2d, #8  // slide window, pull next byte from v17
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        ld1            {v24.8h}, [x4], x10  // src2 row
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqrshrun        v16.8b, v16.8h, #7  // rounding >>7, clip to u8
+        st1            {v16.s}[0], [x0], #4 // pixels 0..3
+        st1            {v16.h}[2], [x0], x1 // pixels 4..5
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Bidirectional horizontal qpel, width 8. Same register roles and
+// sliding-window scheme as bi_h4 above; stores a full 8-byte row.
+function ff_hevc_put_hevc_qpel_bi_h8_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3          // back up 3 pixels for the 8-tap window
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        ushr            v16.2d, v16.2d, #8  // slide window, pull next byte from v17
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        ld1            {v24.8h}, [x4], x10  // src2 row
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqrshrun        v16.8b, v16.8h, #7  // rounding >>7, clip to u8
+        st1            {v16.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Bidirectional horizontal qpel, width 12. Uses ld2 to split the source
+// into even pixels (v16) and odd pixels (v17); each shift-and-insert then
+// advances a register by TWO source pixels, so consecutive taps alternate
+// between v16 and v17. The two deinterleaved accumulators (v20 even
+// outputs, v21 odd outputs) are re-interleaved with zip1 before storing.
+function ff_hevc_put_hevc_qpel_bi_h12_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3          // back up 3 pixels for the 8-tap window
+        sub             x1, x1, #8          // second store below advances by 8 first
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v16.8b, v17.8b}, [x2]  // v16 = even pixels, v17 = odd pixels
+        ldr             w12, [x2, #16]      // spill-over bytes 16..19
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8  // slide even lane; refill from w12
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v16.8b, v1.8b
+        ushr            v17.2d, v17.2d, #8  // slide odd lane; refill from w12
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v20.8h, v16.8b, v2.8b
+        umlal           v20.8h, v17.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlsl           v21.8h, v17.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        // From here on no refill is needed: only the first 12 lanes of the
+        // windows contribute to the stored 12 outputs.
+        ushr            v17.2d, v17.2d, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        ld2            {v24.8h, v25.8h}, [x4], x10  // src2 row, deinterleaved to match
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqadd           v17.8h, v21.8h, v25.8h
+        sqrshrun        v16.8b, v16.8h, #7  // rounding >>7, clip to u8
+        sqrshrun        v17.8b, v17.8h, #7
+        zip1            v16.16b, v16.16b, v17.16b   // re-interleave even/odd
+        st1            {v16.8b}, [x0], #8   // pixels 0..7
+        st1            {v16.s}[2], [x0], x1 // pixels 8..11
+        add             x2, x2, x3
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Bidirectional horizontal qpel, width 16. Same deinterleaved even/odd
+// scheme as bi_h12; here a full 8-byte spill-over (x12) refills the
+// sliding windows, and the result is stored re-interleaved via st2.
+function ff_hevc_put_hevc_qpel_bi_h16_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3          // back up 3 pixels for the 8-tap window
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v16.8b, v17.8b}, [x2]  // v16 = even pixels, v17 = odd pixels
+        ldr             x12, [x2, #16]      // spill-over bytes 16..23
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8  // slide even lane; refill from x12
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v16.8b, v1.8b
+        ushr            v17.2d, v17.2d, #8  // slide odd lane; refill from x12
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v20.8h, v16.8b, v2.8b
+        umlal           v20.8h, v17.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        ld2            {v24.8h, v25.8h}, [x4], x10  // src2 row, deinterleaved to match
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqadd           v17.8h, v21.8h, v25.8h
+        sqrshrun        v16.8b, v16.8h, #7  // rounding >>7, clip to u8
+        sqrshrun        v17.8b, v17.8h, #7
+        st2            {v16.8b, v17.8b}, [x0], x1   // store re-interleaved
+        add             x2, x2, x3
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Bidirectional horizontal qpel, width 24. ld3 splits the source into
+// three phase lanes (v16/v17/v18, every 3rd pixel); each shift-and-insert
+// advances a lane by THREE source pixels, refilled from the spill-over
+// in x12. Accumulators v20/v21/v22 hold the three output phases and are
+// stored re-interleaved via st3. Height is counted in x5.
+// (Dropped the dead "mov x11, x7" that was here: x11 was never read in
+// this function and the "// height" comment on it was wrong — leftover
+// from the uni variants, where x7 carries the height-like width counter.)
+function ff_hevc_put_hevc_qpel_bi_h24_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3          // back up 3 pixels for the 8-tap window
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld3            {v16.8b, v17.8b, v18.8b}, [x2]
+        ldr             x12, [x2, #24]      // spill-over bytes 24..31
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8  // slide lane 0; refill from x12
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v16.8b, v2.8b
+        ushr            v17.2d, v17.2d, #8  // slide lane 1
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v16.8b, v1.8b
+        umlsl           v22.8h, v17.8b, v2.8b
+        ushr            v18.2d, v18.2d, #8  // slide lane 2
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v3.8b
+        umlal           v20.8h, v17.8b, v4.8b
+        umlsl           v20.8h, v18.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v3.8b
+        umlal           v21.8h, v18.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v22.8h, v18.8b, v3.8b
+        umlal           v22.8h, v16.8b, v4.8b
+        umlsl           v22.8h, v17.8b, v5.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v18.8b, v7.8b
+        umlal           v22.8h, v18.8b, v6.8b
+        umlsl           v22.8h, v16.8b, v7.8b
+        ld3            {v23.8h, v24.8h, v25.8h}, [x4], x10  // src2 row, split to match
+        sqadd           v16.8h, v20.8h, v23.8h
+        sqadd           v17.8h, v21.8h, v24.8h
+        sqadd           v18.8h, v22.8h, v25.8h
+        sqrshrun        v16.8b, v16.8h, #7  // rounding >>7, clip to u8
+        sqrshrun        v17.8b, v17.8h, #7
+        sqrshrun        v18.8b, v18.8h, #7
+        st3            {v16.8b, v17.8b, v18.8b}, [x0], x1   // store re-interleaved
+        add             x2, x2, x3
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Bidirectional horizontal qpel, width 32. ld4 splits the source into
+// four phase lanes (v16..v19, every 4th pixel); each shift-and-insert
+// advances a lane by FOUR source pixels, refilled from the spill-over in
+// x12. Accumulators v20..v23 hold the four output phases and are stored
+// re-interleaved via st4. Height is counted in x5.
+// (Dropped the dead "mov x11, x7" that was here: x11 was never read in
+// this function and its "// height" comment was wrong — copy-paste
+// leftover from the uni variants.)
+function ff_hevc_put_hevc_qpel_bi_h32_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3          // back up 3 pixels for the 8-tap window
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld4            {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
+        ldr             x12, [x2, #32]      // spill-over bytes 32..39
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlal           v20.8h, v19.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8  // slide lane 0; refill from x12
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v19.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        ushr            v17.2d, v17.2d, #8  // slide lane 1
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v19.8b, v1.8b
+        umlsl           v22.8h, v16.8b, v2.8b
+        umlal           v22.8h, v17.8b, v3.8b
+        ushr            v18.2d, v18.2d, #8  // slide lane 2
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v23.8h, v19.8b, v0.8b
+        umlal           v23.8h, v16.8b, v1.8b
+        umlsl           v23.8h, v17.8b, v2.8b
+        umlal           v23.8h, v18.8b, v3.8b
+        ushr            v19.2d, v19.2d, #8  // slide lane 3
+        mov             v19.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        umlal           v20.8h, v18.8b, v6.8b
+        umlsl           v20.8h, v19.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v18.8b, v5.8b
+        umlal           v21.8h, v19.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v22.8h, v18.8b, v4.8b
+        umlsl           v22.8h, v19.8b, v5.8b
+        umlal           v22.8h, v16.8b, v6.8b
+        umlsl           v22.8h, v17.8b, v7.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        umlal           v23.8h, v19.8b, v4.8b
+        umlsl           v23.8h, v16.8b, v5.8b
+        umlal           v23.8h, v17.8b, v6.8b
+        umlsl           v23.8h, v18.8b, v7.8b
+        ld4            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10  // src2 row
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqadd           v17.8h, v21.8h, v25.8h
+        sqadd           v18.8h, v22.8h, v26.8h
+        sqadd           v19.8h, v23.8h, v27.8h
+        sqrshrun        v16.8b, v16.8h, #7  // rounding >>7, clip to u8
+        sqrshrun        v17.8b, v17.8h, #7
+        sqrshrun        v18.8b, v18.8h, #7
+        sqrshrun        v19.8b, v19.8h, #7
+        st4            {v16.8b, v17.8b, v18.8b, v19.8b}, [x0], x1   // re-interleave
+        add             x2, x2, x3
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Bidirectional horizontal qpel, width 48. Like bi_h24 but with 16-byte
+// phase lanes: ld3.16b splits 48 pixels into three lanes; the low halves
+// produce outputs 0..23 (accumulators v20..v22) and the high halves
+// (umlal2/umlsl2) outputs 24..47 (v23..v25). Both 8-byte halves of each
+// lane are refilled independently from the two spill-over loads
+// (x12 = bytes at +24, x13 = bytes at +48). Height is counted in x5.
+// (Dropped the dead "mov x11, x7" that was here: x11 was never read in
+// this function and its "// height" comment was wrong — copy-paste
+// leftover from the uni variants.)
+function ff_hevc_put_hevc_qpel_bi_h48_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3          // back up 3 pixels for the 8-tap window
+        mov             x10, #80            // MAX_PB_SIZE*2 - 48: rest of src2 row stride
+1:      ld3            {v16.16b, v17.16b, v18.16b}, [x2]
+        ldr             x12, [x2, #24]      // spill-over for the low halves
+        ldr             x13, [x2, #48]      // spill-over for the high halves
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlsl2          v23.8h, v16.16b, v0.16b
+        umlal2          v23.8h, v17.16b, v1.16b
+        umlsl2          v23.8h, v18.16b, v2.16b
+        ushr            v16.2d, v16.2d, #8  // slide both halves of lane 0
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v16.8b, v2.8b
+        umlsl2          v24.8h, v17.16b, v0.16b
+        umlal2          v24.8h, v18.16b, v1.16b
+        umlsl2          v24.8h, v16.16b, v2.16b
+        ushr            v17.2d, v17.2d, #8  // slide both halves of lane 1
+        mov             v17.b[7], w12
+        mov             v17.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v16.8b, v1.8b
+        umlsl           v22.8h, v17.8b, v2.8b
+        umlsl2          v25.8h, v18.16b, v0.16b
+        umlal2          v25.8h, v16.16b, v1.16b
+        umlsl2          v25.8h, v17.16b, v2.16b
+        ushr            v18.2d, v18.2d, #8  // slide both halves of lane 2
+        mov             v18.b[7], w12
+        mov             v18.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v20.8h, v16.8b, v3.8b
+        umlal           v20.8h, v17.8b, v4.8b
+        umlsl           v20.8h, v18.8b, v5.8b
+        umlal2          v23.8h, v16.16b, v3.16b
+        umlal2          v23.8h, v17.16b, v4.16b
+        umlsl2          v23.8h, v18.16b, v5.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v21.8h, v17.8b, v3.8b
+        umlal           v21.8h, v18.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        umlal2          v24.8h, v17.16b, v3.16b
+        umlal2          v24.8h, v18.16b, v4.16b
+        umlsl2          v24.8h, v16.16b, v5.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        mov             v17.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v22.8h, v18.8b, v3.8b
+        umlal           v22.8h, v16.8b, v4.8b
+        umlsl           v22.8h, v17.8b, v5.8b
+        umlal2          v25.8h, v18.16b, v3.16b
+        umlal2          v25.8h, v16.16b, v4.16b
+        umlsl2          v25.8h, v17.16b, v5.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        mov             v18.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        umlal2          v23.8h, v16.16b, v6.16b
+        umlsl2          v23.8h, v17.16b, v7.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v18.8b, v7.8b
+        umlal2          v24.8h, v17.16b, v6.16b
+        umlsl2          v24.8h, v18.16b, v7.16b
+        umlal           v22.8h, v18.8b, v6.8b
+        umlsl           v22.8h, v16.8b, v7.8b
+        umlal2          v25.8h, v18.16b, v6.16b
+        umlsl2          v25.8h, v16.16b, v7.16b
+        // src2 row in two ld3 batches: 48 bytes + 48 bytes, then skip to
+        // the next row (48 + 80 = MAX_PB_SIZE * 2).
+        ld3            {v26.8h, v27.8h, v28.8h}, [x4], #48
+        sqadd           v16.8h, v20.8h, v26.8h
+        sqadd           v17.8h, v21.8h, v27.8h
+        sqadd           v18.8h, v22.8h, v28.8h
+        ld3            {v26.8h, v27.8h, v28.8h}, [x4], x10
+        sqadd           v19.8h, v23.8h, v26.8h
+        sqadd           v20.8h, v24.8h, v27.8h
+        sqadd           v21.8h, v25.8h, v28.8h
+        sqrshrun        v16.8b, v16.8h, #7  // rounding >>7, clip to u8
+        sqrshrun        v17.8b, v17.8h, #7
+        sqrshrun        v18.8b, v18.8h, #7
+        sqrshrun2       v16.16b, v19.8h, #7
+        sqrshrun2       v17.16b, v20.8h, #7
+        sqrshrun2       v18.16b, v21.8h, #7
+        st3            {v16.16b, v17.16b, v18.16b}, [x0], x1    // re-interleave
+        add             x2, x2, x3
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Horizontal 8-tap qpel filter, bi-prediction, width 64, 8-bit.
+// x0 = dst (stride x1), x2 = src (stride x3), x4 = 16-bit intermediate
+// src2 (read 128 bytes per row), x5 = height, x6 selects the filter taps.
+// NOTE(review): register/argument mapping inferred from usage here; confirm
+// against the C put_hevc_qpel_bi_* prototype in hevcdsp.
+function ff_hevc_put_hevc_qpel_bi_h64_8_neon, export=1
+        load_qpel_filterb x6, x7                // taps -> v0.8b..v7.8b (x7 scratch)
+        sub             x2, x2, #3              // back up 3 px for the 8-tap window
+// ld4 de-interleaves 64 bytes: v(16+i).b[j] = src[4*j + i], i.e. four "phase"
+// registers each holding every 4th sample.  x12/x13 hold the next source
+// bytes (src[32..39] / src[64..70]) that are shifted into the low/high
+// halves as the window advances one sample per tap pair.
+1:      ld4            {v16.16b, v17.16b, v18.16b, v19.16b}, [x2]
+        ldr             x12, [x2, #32]          // refill bytes for the low halves
+        ldr             x13, [x2, #64]          // refill bytes for the high halves
+        movi            v20.8h, #0              // v20..v23: accumulators, outputs 0..31
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        movi            v24.8h, #0              // v24..v27: accumulators, outputs 32..63
+        movi            v25.8h, #0
+        movi            v26.8h, #0
+        movi            v27.8h, #0
+// Taps 0..3 for each of the four output phases.  Tap signs are folded into
+// the umlal/umlsl choice; the registers presumably hold the tap magnitudes —
+// confirm against load_qpel_filterb.
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlal           v20.8h, v19.8b, v3.8b
+        umlsl2          v24.8h, v16.16b, v0.16b
+        umlal2          v24.8h, v17.16b, v1.16b
+        umlsl2          v24.8h, v18.16b, v2.16b
+        umlal2          v24.8h, v19.16b, v3.16b
+// Advance phase 0 by 4 samples: shift each 64-bit half right one byte and
+// insert the next source byte from x12 (low half) / x13 (high half).
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        mov             v16.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v19.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        umlsl2          v25.8h, v17.16b, v0.16b
+        umlal2          v25.8h, v18.16b, v1.16b
+        umlsl2          v25.8h, v19.16b, v2.16b
+        umlal2          v25.8h, v16.16b, v3.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        mov             v17.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v19.8b, v1.8b
+        umlsl           v22.8h, v16.8b, v2.8b
+        umlal           v22.8h, v17.8b, v3.8b
+        umlsl2          v26.8h, v18.16b, v0.16b
+        umlal2          v26.8h, v19.16b, v1.16b
+        umlsl2          v26.8h, v16.16b, v2.16b
+        umlal2          v26.8h, v17.16b, v3.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        mov             v18.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v23.8h, v19.8b, v0.8b
+        umlal           v23.8h, v16.8b, v1.8b
+        umlsl           v23.8h, v17.8b, v2.8b
+        umlal           v23.8h, v18.8b, v3.8b
+        umlsl2          v27.8h, v19.16b, v0.16b
+        umlal2          v27.8h, v16.16b, v1.16b
+        umlsl2          v27.8h, v17.16b, v2.16b
+        umlal2          v27.8h, v18.16b, v3.16b
+        ushr            v19.2d, v19.2d, #8
+        mov             v19.b[7], w12
+        lsr             x12, x12, #8
+        mov             v19.b[15], w13
+        lsr             x13, x13, #8
+// Taps 4..7, same rotation pattern.
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        umlal           v20.8h, v18.8b, v6.8b
+        umlsl           v20.8h, v19.8b, v7.8b
+        umlal2          v24.8h, v16.16b, v4.16b
+        umlsl2          v24.8h, v17.16b, v5.16b
+        umlal2          v24.8h, v18.16b, v6.16b
+        umlsl2          v24.8h, v19.16b, v7.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        mov             v16.b[15], w13
+        lsr             x13, x13, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v18.8b, v5.8b
+        umlal           v21.8h, v19.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        umlal2          v25.8h, v17.16b, v4.16b
+        umlsl2          v25.8h, v18.16b, v5.16b
+        umlal2          v25.8h, v19.16b, v6.16b
+        umlsl2          v25.8h, v16.16b, v7.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        mov             v17.b[15], w13
+        lsr             x13, x13, #8
+        umlal           v22.8h, v18.8b, v4.8b
+        umlsl           v22.8h, v19.8b, v5.8b
+        umlal           v22.8h, v16.8b, v6.8b
+        umlsl           v22.8h, v17.8b, v7.8b
+        umlal2          v26.8h, v18.16b, v4.16b
+        umlsl2          v26.8h, v19.16b, v5.16b
+        umlal2          v26.8h, v16.16b, v6.16b
+        umlsl2          v26.8h, v17.16b, v7.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12               // last refill: no lsr needed after this
+        mov             v18.b[15], w13
+        umlal           v23.8h, v19.8b, v4.8b
+        umlsl           v23.8h, v16.8b, v5.8b
+        umlal           v23.8h, v17.8b, v6.8b
+        umlsl           v23.8h, v18.8b, v7.8b
+        umlal2          v27.8h, v19.16b, v4.16b
+        umlsl2          v27.8h, v16.16b, v5.16b
+        umlal2          v27.8h, v17.16b, v6.16b
+        umlsl2          v27.8h, v18.16b, v7.16b
+// src2 is read with ld4 so its lanes match the phase-split accumulators.
+        ld4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
+        sqadd           v20.8h, v20.8h, v28.8h      // bi: add src2, saturating
+        sqadd           v21.8h, v21.8h, v29.8h
+        sqadd           v22.8h, v22.8h, v30.8h
+        sqadd           v23.8h, v23.8h, v31.8h
+        ld4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
+        sqadd           v24.8h, v24.8h, v28.8h
+        sqadd           v25.8h, v25.8h, v29.8h
+        sqadd           v26.8h, v26.8h, v30.8h
+        sqadd           v27.8h, v27.8h, v31.8h
+        sqrshrun        v16.8b, v20.8h, #7          // round-shift, narrow to u8
+        sqrshrun        v17.8b, v21.8h, #7
+        sqrshrun        v18.8b, v22.8h, #7
+        sqrshrun        v19.8b, v23.8h, #7
+        sqrshrun2       v16.16b, v24.8h, #7
+        sqrshrun2       v17.16b, v25.8h, #7
+        sqrshrun2       v18.16b, v26.8h, #7
+        sqrshrun2       v19.16b, v27.8h, #7
+        st4            {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1  // re-interleave on store
+        add             x2, x2, x3                  // next source row
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+// Vertical 8-tap qpel filter, bi-prediction, width 4, 8-bit.
+// x0 = dst (stride x1), x2 = src (stride x3), x4 = 16-bit intermediate
+// src2 (row pitch MAX_PB_SIZE*2), x5 = height, x7 selects the filter taps.
+// NOTE(review): argument mapping inferred from register usage; confirm
+// against the C put_hevc_qpel_bi_* prototype.
+function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1
+        load_qpel_filterb x7, x6                // taps -> v0.8b..v7.8b (x6 scratch)
+        sub             x2, x2, x3, lsl #1      // rewind src by 3 rows so the
+        sub             x2, x2, x3              // 8-tap window is positioned
+        mov             x12, #(MAX_PB_SIZE * 2) // src2 row pitch in bytes
+        ld1            {v16.s}[0], [x2], x3     // prime 7 rows of history, 4 px each
+        ld1            {v17.s}[0], [x2], x3
+        ld1            {v18.s}[0], [x2], x3
+        ld1            {v19.s}[0], [x2], x3
+        ld1            {v20.s}[0], [x2], x3
+        ld1            {v21.s}[0], [x2], x3
+        ld1            {v22.s}[0], [x2], x3
+// Loop unrolled 8x: each step loads one new row into the register holding
+// the oldest one and rotates the v16..v23 window by one position.
+1:      ld1            {v23.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h  // bi: add src2, saturating
+        sqrshrun        v25.8b, v24.8h, #7      // round-shift, narrow to u8
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// Vertical 8-tap qpel filter, bi-prediction, width 6, 8-bit.
+// Same scheme as the v4 variant, but 8 px per row are filtered and the
+// store writes 4 + 2 bytes (6 px); x1 is pre-reduced by the #4
+// post-increment of the first store.  x0 = dst (stride x1), x2 = src
+// (stride x3), x4 = 16-bit src2 (pitch MAX_PB_SIZE*2), x5 = height,
+// x7 selects the filter taps.
+// NOTE(review): argument mapping inferred from register usage — confirm.
+function ff_hevc_put_hevc_qpel_bi_v6_8_neon, export=1
+        load_qpel_filterb x7, x6                // taps -> v0.8b..v7.8b (x6 scratch)
+        sub             x2, x2, x3, lsl #1      // rewind src by 3 rows
+        sub             x2, x2, x3
+        sub             x1, x1, #4              // compensate the split store below
+        mov             x12, #(MAX_PB_SIZE * 2) // src2 row pitch in bytes
+        ld1            {v16.8b}, [x2], x3       // prime 7 rows of history
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+// Loop unrolled 8x, rotating the v16..v23 row window by one each step.
+1:      ld1            {v23.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h  // bi: add src2, saturating
+        sqrshrun        v25.8b, v24.8h, #7      // round-shift, narrow to u8
+        st1            {v25.s}[0], [x0], #4    // 6 px = 4 bytes + 2 bytes
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// Vertical 8-tap qpel filter, bi-prediction, width 8, 8-bit.
+// x0 = dst (stride x1), x2 = src (stride x3), x4 = 16-bit src2
+// (pitch MAX_PB_SIZE*2), x5 = height, x7 selects the filter taps.
+// NOTE(review): argument mapping inferred from register usage — confirm.
+function ff_hevc_put_hevc_qpel_bi_v8_8_neon, export=1
+        load_qpel_filterb x7, x6                // taps -> v0.8b..v7.8b (x6 scratch)
+        sub             x2, x2, x3, lsl #1      // rewind src by 3 rows
+        sub             x2, x2, x3
+        mov             x12, #(MAX_PB_SIZE * 2) // src2 row pitch in bytes
+        ld1            {v16.8b}, [x2], x3       // prime 7 rows of history
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+// Loop unrolled 8x, rotating the v16..v23 row window by one each step.
+1:      ld1            {v23.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h  // bi: add src2, saturating
+        sqrshrun        v25.8b, v24.8h, #7      // round-shift, narrow to u8
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// Vertical 8-tap qpel filter, bi-prediction, width 12, 8-bit.
+// 16 px per row are loaded and filtered (calc_qpelb low 8 / calc_qpelb2
+// high 8) but only 12 are stored: 8 bytes + 4 bytes; x1 is pre-reduced by
+// the #8 post-increment of the first store.  x0 = dst (stride x1),
+// x2 = src (stride x3), x4 = 16-bit src2 (pitch MAX_PB_SIZE*2),
+// x5 = height, x7 selects the filter taps.
+// NOTE(review): argument mapping inferred from register usage — confirm.
+function ff_hevc_put_hevc_qpel_bi_v12_8_neon, export=1
+        load_qpel_filterb x7, x6                // taps -> v0.8b..v7.8b (x6 scratch)
+        sub             x2, x2, x3, lsl #1      // rewind src by 3 rows
+        sub             x2, x2, x3
+        sub             x1, x1, #8              // compensate the split store below
+        mov             x12, #(MAX_PB_SIZE * 2) // src2 row pitch in bytes
+        ld1            {v16.16b}, [x2], x3      // prime 7 rows of history
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+        ld1            {v19.16b}, [x2], x3
+        ld1            {v20.16b}, [x2], x3
+        ld1            {v21.16b}, [x2], x3
+        ld1            {v22.16b}, [x2], x3
+// Loop unrolled 8x, rotating the v16..v23 row window by one each step.
+1:      ld1            {v23.16b}, [x2], x3
+        movi            v24.8h, #0              // accumulator, columns 0..7
+        movi            v25.8h, #0              // accumulator, columns 8..15
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h  // bi: add src2, saturating
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7      // round-shift, narrow to u8
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8      // 12 px = 8 bytes + 4 bytes
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// Vertical 8-tap qpel filter, bi-prediction, width 16, 8-bit.
+// Like the v12 variant but the full 16 filtered pixels are stored.
+// x0 = dst (stride x1), x2 = src (stride x3), x4 = 16-bit src2
+// (pitch MAX_PB_SIZE*2), x5 = height, x7 selects the filter taps.
+// NOTE(review): argument mapping inferred from register usage — confirm.
+function ff_hevc_put_hevc_qpel_bi_v16_8_neon, export=1
+        load_qpel_filterb x7, x6                // taps -> v0.8b..v7.8b (x6 scratch)
+        sub             x2, x2, x3, lsl #1      // rewind src by 3 rows
+        sub             x2, x2, x3
+        mov             x12, #(MAX_PB_SIZE * 2) // src2 row pitch in bytes
+        ld1            {v16.16b}, [x2], x3      // prime 7 rows of history
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+        ld1            {v19.16b}, [x2], x3
+        ld1            {v20.16b}, [x2], x3
+        ld1            {v21.16b}, [x2], x3
+        ld1            {v22.16b}, [x2], x3
+// Loop unrolled 8x, rotating the v16..v23 row window by one each step.
+1:      ld1            {v23.16b}, [x2], x3
+        movi            v24.8h, #0              // accumulator, columns 0..7
+        movi            v25.8h, #0              // accumulator, columns 8..15
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h  // bi: add src2, saturating
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7      // round-shift, narrow to u8
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// Vertical 8-tap qpel filter, bi-prediction, width 24, 8-bit.
+// Implemented as width 16 + width 8: call the v16 variant, then re-run the
+// v8 variant shifted 16 px into dst/src (and 16 int16 = 32 bytes into
+// src2).  Both callees consume x0..x5, so the arguments are spilled once to
+// a 64-byte frame and reloaded between calls; x7 (filter index) and the
+// return address are kept in the same frame.
+function ff_hevc_put_hevc_qpel_bi_v24_8_neon, export=1
+        sub             sp, sp, #64             // frame: args + x7/lr
+        stp             x4, x5, [sp]
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        stp             x7, x30, [sp, #48]
+        bl              X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+        ldp             x4, x5, [sp]            // reload clobbered arguments
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldr             x7, [sp, #48]
+        add             x0, x0, #16             // dst  += 16 px
+        add             x2, x2, #16             // src  += 16 px
+        add             x4, x4, #32             // src2 += 16 * sizeof(int16)
+        bl              X(ff_hevc_put_hevc_qpel_bi_v8_8_neon)
+        ldr             x30, [sp, #56]          // restore return address
+        add             sp, sp, #64
+        ret
+endfunc
+
+// bi_v32: 8-tap vertical luma filter over a 32-pixel-wide block combined
+// with the int16_t intermediate src2 (bi-prediction), 8-bit output.
+// Registers on entry: x0 dst, x1 dststride, x2 src, x3 srcstride,
+// x4 src2 (row pitch MAX_PB_SIZE*2 bytes), x5 height, x7 my (filter
+// selector); width is the 9th argument, passed on the stack.
+// Also reached with width 64 via a tail-branch from bi_v64; the outer
+// loop at 1: then runs twice, 32 columns per pass.
+// v8-v15 are callee-saved under AAPCS64, hence the two 64-byte spills.
+function ff_hevc_put_hevc_qpel_bi_v32_8_neon, export=1
+        sub             sp, sp, #64
+        st1            {v12.16b, v13.16b, v14.16b, v15.16b}, [sp]
+        sub             sp, sp, #64
+        st1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+        load_qpel_filterb x7, x6                // filter taps selected by my; x6 is scratch
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3              // src -= 3 * srcstride (filter history)
+        ldr             w6, [sp, #128]          // width stack arg; +128 skips the two spills
+        mov             x12, #(MAX_PB_SIZE * 2) // src2 row pitch in bytes
+1:      mov             x11, x5         // height
+        mov             x10, x0         // dst
+        mov             x8, x2          // src
+        mov             x9, x4          // src2
+
+        // Prime 7 rows of vertical history; the 8th row of each window is
+        // loaded inside the loop.  Each row is a 32-byte pair vN/vN+1.
+        ld1            {v16.16b, v17.16b}, [x8], x3
+        ld1            {v18.16b, v19.16b}, [x8], x3
+        ld1            {v20.16b, v21.16b}, [x8], x3
+        ld1            {v22.16b, v23.16b}, [x8], x3
+        ld1            {v24.16b, v25.16b}, [x8], x3
+        ld1            {v26.16b, v27.16b}, [x8], x3
+        ld1            {v28.16b, v29.16b}, [x8], x3
+        // Stage 1 of an 8x unrolled loop: filter, add src2 with signed
+        // saturation, then round/narrow/saturate to unsigned 8-bit.
+2:      ld1            {v30.16b, v31.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v16, v18, v20, v22, v24, v26, v28, v30
+        calc_qpelb2     v9,  v16, v18, v20, v22, v24, v26, v28, v30
+        calc_qpelb      v10, v17, v19, v21, v23, v25, v27, v29, v31
+        calc_qpelb2     v11, v17, v19, v21, v23, v25, v27, v29, v31
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        // Stages 2-8 are identical except the 8-row register window is
+        // rotated instead of shifting data: the oldest row pair is
+        // refilled with the newest input row.
+        ld1            {v16.16b, v17.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb2     v9,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb      v10, v19, v21, v23, v25, v27, v29, v31, v17
+        calc_qpelb2     v11, v19, v21, v23, v25, v27, v29, v31, v17
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.16b, v19.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb2     v9,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb      v10, v21, v23, v25, v27, v29, v31, v17, v19
+        calc_qpelb2     v11, v21, v23, v25, v27, v29, v31, v17, v19
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.16b, v21.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb2     v9,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb      v10, v23, v25, v27, v29, v31, v17, v19, v21
+        calc_qpelb2     v11, v23, v25, v27, v29, v31, v17, v19, v21
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.16b, v23.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb2     v9,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb      v10, v25, v27, v29, v31, v17, v19, v21, v23
+        calc_qpelb2     v11, v25, v27, v29, v31, v17, v19, v21, v23
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v24.16b, v25.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb2     v9,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb      v10, v27, v29, v31, v17, v19, v21, v23, v25
+        calc_qpelb2     v11, v27, v29, v31, v17, v19, v21, v23, v25
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v26.16b, v27.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb2     v9,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb      v10, v29, v31, v17, v19, v21, v23, v25, v27
+        calc_qpelb2     v11, v29, v31, v17, v19, v21, v23, v25, v27
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v28.16b, v29.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb2     v9,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb      v10, v31, v17, v19, v21, v23, v25, v27, v29
+        calc_qpelb2     v11, v31, v17, v19, v21, v23, v25, v27, v29
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.hi            2b
+
+3:      add             x0, x0, #32          // dst  += 32 pixels
+        add             x2, x2, #32          // src  += 32 pixels
+        add             x4, x4, #64          // src2 += 32 int16_t elements
+        subs            x6, x6, #32          // next 32-column pass (width 64 case)
+        b.ne            1b
+        ld1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], #64
+        ld1            {v12.16b, v13.16b, v14.16b, v15.16b}, [sp], #64   // restore callee-saved
+        ret
+endfunc
+
+// bi_v48 = bi_v32 on the left 32 columns, then bi_v16 on the right 16.
+// bi_v32 reads its width from the stack, so a fake stack argument of 32
+// is pushed just below the spilled registers before the first call; it
+// is popped (and discarded) right after.  x7 (my) is re-read without
+// popping so the final ldp can restore x30 from the same pair.
+function ff_hevc_put_hevc_qpel_bi_v48_8_neon, export=1
+        stp             x7, x30, [sp, #-16]!    // my + link register
+        stp             x0, x1, [sp, #-16]!     // dst, dststride
+        stp             x2, x3, [sp, #-16]!     // src, srcstride
+        stp             x4, x5, [sp, #-16]!     // src2, height
+        mov             x8, #32
+        stp             x8, x8, [sp, #-16]!     // stack width arg = 32 for bi_v32
+        bl              X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+        ldp             x8, xzr, [sp], #16      // drop the fake width pair
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        ldr             x7, [sp]                // reload my; pair stays pushed for x30
+        add             x0, x0, #32             // dst  += 32 pixels
+        add             x2, x2, #32             // src  += 32 pixels
+        add             x4, x4, #64             // src2 += 32 int16_t elements
+        bl              X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+        ldp             xzr, x30, [sp], #16     // discard x7 slot, restore lr
+        ret
+endfunc
+
+// Width 64 is handled entirely by bi_v32, whose outer loop consumes the
+// stack width argument (64 here) in 32-column passes.  Tail-branch: no
+// registers or return address need saving.
+function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+endfunc
+
+// bi_hv4: separable 8-tap H then 8-tap V filter on a 4-pixel-wide block,
+// combined with the int16_t intermediate src2 (bi-prediction).
+// The horizontal pass writes (height + 7) int16_t rows (row pitch
+// MAX_PB_SIZE*2 = 128 bytes) into a temporary array carved out of the
+// stack; the vertical pass then consumes it with post-incremented loads
+// from sp, which restores sp exactly when the last row has been read:
+// 7 priming rows + one row per output line = height + 7 rows in total.
+// Only 4 halfword lanes per row are used, so every tmp-array load is
+// .4h (the in-loop refills previously loaded .8h; the extra 8 bytes
+// were in-bounds but unused).
+function ff_hevc_put_hevc_qpel_bi_hv4_8_neon, export=1
+        add             x10, x5, #7
+        lsl             x10, x10, #7            // (height + 7) * MAX_PB_SIZE * 2 bytes
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48             // h-pass dst = tmp_array (above 3 spilled pairs)
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3              // src -= 3 * srcstride (vertical history)
+        mov             x2, x3                  // srcstride
+        add             x3, x5, #7              // h-pass height = height + 7
+        mov             x4, x6                  // mx
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x7, x6                // vertical taps selected by my
+        mov             x9, #(MAX_PB_SIZE * 2)  // tmp_array / src2 row pitch
+        // prime 7 rows of vertical history from the tmp array
+        ld1            {v16.4h}, [sp], x9
+        ld1            {v17.4h}, [sp], x9
+        ld1            {v18.4h}, [sp], x9
+        ld1            {v19.4h}, [sp], x9
+        ld1            {v20.4h}, [sp], x9
+        ld1            {v21.4h}, [sp], x9
+        ld1            {v22.4h}, [sp], x9
+        // 8x unrolled loop: each stage rotates the 8-row register window,
+        // adds the src2 row, then rounds/narrows/saturates to u8.
+1:      ld1            {v23.4h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.4h}, [sp], x9
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.4h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.4h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.4h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.4h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret                                     // sp fully consumed back to entry value
+endfunc
+
+// bi_hv6: separable 8-tap H then 8-tap V filter on a 6-pixel-wide block,
+// combined with src2 (bi-prediction).  Same stack tmp-array scheme as
+// bi_hv4: the horizontal pass fills (height + 7) int16_t rows below the
+// spilled pairs, and the vertical pass consumes them via post-indexed
+// loads from sp, restoring sp exactly at the last row.
+// Each 6-pixel output row is stored as a 4-byte lane followed by a
+// 2-byte lane, hence dststride is reduced by 4 up front.
+function ff_hevc_put_hevc_qpel_bi_hv6_8_neon, export=1
+        add             x10, x5, #7
+        lsl             x10, x10, #7            // (height + 7) * MAX_PB_SIZE * 2 bytes
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48             // h-pass dst = tmp_array
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3              // src -= 3 * srcstride
+        mov             x2, x3
+        add             x3, x5, #7              // h-pass height = height + 7
+        mov             x4, x6                  // mx
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x7, x6                // vertical taps selected by my
+        sub             x1, x1, #4              // split 4B+2B store advances dst by 4 already
+        mov             x9, #(MAX_PB_SIZE * 2)
+        // prime 7 rows of vertical history from the tmp array
+        ld1            {v16.8h}, [sp], x9
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+        // 8x unrolled loop, rotating the 8-row register window per stage
+1:      ld1            {v23.8h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+        calc_qpelh2     v2, v2, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x9
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+        calc_qpelh2     v2, v2, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+        calc_qpelh2     v2, v2, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+        calc_qpelh2     v2, v2, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+        calc_qpelh2     v2, v2, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+        calc_qpelh2     v2, v2, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+        calc_qpelh2     v2, v2, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+        calc_qpelh2     v2, v2, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret                                     // sp fully consumed back to entry value
+endfunc
+
+// bi_hv8: separable 8-tap H then 8-tap V filter on an 8-pixel-wide
+// block, combined with src2 (bi-prediction).  Same stack tmp-array
+// scheme as bi_hv4/hv6: the horizontal pass fills (height + 7) int16_t
+// rows, and the vertical pass consumes them via post-indexed loads from
+// sp, restoring sp exactly at the last row.
+function ff_hevc_put_hevc_qpel_bi_hv8_8_neon, export=1
+        add             x10, x5, #7
+        lsl             x10, x10, #7            // (height + 7) * MAX_PB_SIZE * 2 bytes
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48             // h-pass dst = tmp_array
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3              // src -= 3 * srcstride
+        mov             x2, x3
+        add             x3, x5, #7              // h-pass height = height + 7
+        mov             x4, x6                  // mx
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x7, x6                // vertical taps selected by my
+        mov             x9, #(MAX_PB_SIZE * 2)
+        // prime 7 rows of vertical history from the tmp array
+        ld1            {v16.8h}, [sp], x9
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+        // 8x unrolled loop, rotating the 8-row register window per stage
+1:      ld1            {v23.8h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+        calc_qpelh2     v2, v2, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x9
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+        calc_qpelh2     v2, v2, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+        calc_qpelh2     v2, v2, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+        calc_qpelh2     v2, v2, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+        calc_qpelh2     v2, v2, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+        calc_qpelh2     v2, v2, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+        calc_qpelh2     v2, v2, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+        calc_qpelh2     v2, v2, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret                                     // sp fully consumed back to entry value
+endfunc
+
+// bi_hv12 = bi_hv8 on the left 8 columns, then bi_hv4 on the right 4.
+// All argument registers the second call needs are spilled around the
+// first call; x30 rides in the top pair (paired with a zero slot to
+// keep sp 16-byte aligned).
+function ff_hevc_put_hevc_qpel_bi_hv12_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!   // link register (xzr pads alignment)
+        stp             x0, x1, [sp, #-16]!     // dst, dststride
+        stp             x2, x3, [sp, #-16]!     // src, srcstride
+        stp             x4, x5, [sp, #-16]!     // src2, height
+        stp             x6, x7, [sp, #-16]!     // mx, my
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+        ldp             x6, x7, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #8              // dst  += 8 pixels
+        add             x2, x2, #8              // src  += 8 pixels
+        add             x4, x4, #16             // src2 += 8 int16_t elements
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv4_8_neon)
+        ldp             xzr, x30, [sp], #16     // restore lr
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv16_8_neon, export=1
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #16          // width
+.Lqpel_bi_hv16_loop:
+        load_qpel_filterh x7, x8
+        mov             x9, #(MAX_PB_SIZE * 2)
+        mov             x10, x6
+
+1:      mov             x11, x5         // height
+        mov             x7, x0          // dst
+        mov             x8, sp          // src
+        mov             x12, x4         // src2
+
+        ld1            {v16.8h, v17.8h}, [x8], x9
+        ld1            {v18.8h, v19.8h}, [x8], x9
+        ld1            {v20.8h, v21.8h}, [x8], x9
+        ld1            {v22.8h, v23.8h}, [x8], x9
+        ld1            {v24.8h, v25.8h}, [x8], x9
+        ld1            {v26.8h, v27.8h}, [x8], x9
+        ld1            {v28.8h, v29.8h}, [x8], x9
+2:      ld1            {v30.8h, v31.8h}, [x8], x9
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sshr
+        calc_qpelh2     v2, v2, v16, v18, v20, v22, v24, v26, v28, v30, sshr
+        calc_qpelh      v3, v17, v19, v21, v23, v25, v27, v29, v31, sshr
+        calc_qpelh2     v4, v4, v17, v19, v21, v23, v25, v27, v29, v31, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v16.8h, v17.8h}, [x8], x9
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sshr
+        calc_qpelh2     v2, v2, v18, v20, v22, v24, v26, v28, v30, v16, sshr
+        calc_qpelh      v3, v19, v21, v23, v25, v27, v29, v31, v17, sshr
+        calc_qpelh2     v4, v4, v19, v21, v23, v25, v27, v29, v31, v17, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.8h, v19.8h}, [x8], x9
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sshr
+        calc_qpelh2     v2, v2, v20, v22, v24, v26, v28, v30, v16, v18, sshr
+        calc_qpelh      v3, v21, v23, v25, v27, v29, v31, v17, v19, sshr
+        calc_qpelh2     v4, v4, v21, v23, v25, v27, v29, v31, v17, v19, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.8h, v21.8h}, [x8], x9
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sshr
+        calc_qpelh2     v2, v2, v22, v24, v26, v28, v30, v16, v18, v20, sshr
+        calc_qpelh      v3, v23, v25, v27, v29, v31, v17, v19, v21, sshr
+        calc_qpelh2     v4, v4, v23, v25, v27, v29, v31, v17, v19, v21, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.8h, v23.8h}, [x8], x9
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sshr
+        calc_qpelh2     v2, v2, v24, v26, v28, v30, v16, v18, v20, v22, sshr
+        calc_qpelh      v3, v25, v27, v29, v31, v17, v19, v21, v23, sshr
+        calc_qpelh2     v4, v4, v25, v27, v29, v31, v17, v19, v21, v23, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v24.8h, v25.8h}, [x8], x9
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sshr
+        calc_qpelh2     v2, v2, v26, v28, v30, v16, v18, v20, v22, v24, sshr
+        calc_qpelh      v3, v27, v29, v31, v17, v19, v21, v23, v25, sshr
+        calc_qpelh2     v4, v4, v27, v29, v31, v17, v19, v21, v23, v25, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v26.8h, v27.8h}, [x8], x9
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sshr
+        calc_qpelh2     v2, v2, v28, v30, v16, v18, v20, v22, v24, v26, sshr
+        calc_qpelh      v3, v29, v31, v17, v19, v21, v23, v25, v27, sshr
+        calc_qpelh2     v4, v4, v29, v31, v17, v19, v21, v23, v25, v27, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v28.8h, v29.8h}, [x8], x9
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sshr
+        calc_qpelh2     v2, v2, v30, v16, v18, v20, v22, v24, v26, v28, sshr
+        calc_qpelh      v3, v31, v17, v19, v21, v23, v25, v27, v29, sshr
+        calc_qpelh2     v4, v4, v31, v17, v19, v21, v23, v25, v27, v29, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.ne            2b
+
+3:      add             x0, x0, #16
+        add             sp, sp, #32
+        add             x4, x4, #32
+        subs            x10, x10, #16
+        b.ne            1b
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             x10, x10, x6, lsl #1 // part of first line
+        add             sp, sp, x10         // tmp_array without first line
+        ret
+endfunc
+
+// Bi-pred QPEL H+V interpolation, width 24, 8-bit: composed as a left
+// 16-wide call plus a right 8-wide call.
+// NOTE(review): assumes the hevcdsp bi argument order — x0=dst,
+// x1=dststride, x2=src, x3=srcstride, x4=src2 (int16_t *), x5=height,
+// x6=mx, x7=my — confirm against the C prototypes in hevcdsp.h.
+function ff_hevc_put_hevc_qpel_bi_hv24_8_neon, export=1
+        // Save lr and all eight argument registers: the callee consumes
+        // them.  xzr is paired with x30 only to keep sp 16-byte aligned,
+        // as AAPCS64 requires whenever sp is used to access memory.
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x6, x7, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv16_8_neon)
+        // Restore the original arguments (pop order mirrors push order).
+        ldp             x6, x7, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        // Advance to the right 8-wide column: +16 bytes in dst and src,
+        // +32 bytes in src2 (16 int16_t samples).
+        add             x0, x0, #16
+        add             x2, x2, #16
+        add             x4, x4, #32
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+        // ldp with xzr as the first target discards the dummy zero that
+        // padded the pair and recovers lr.
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+// Bi-pred QPEL H+V interpolation, width 32, 8-bit.  Pass 1: run the
+// horizontal 8-tap filter over height+7 rows into a temporary buffer on
+// the stack.  Pass 2: tail-branch into the shared vertical+bi loop
+// (.Lqpel_bi_hv16_loop), which also deallocates the buffer and returns.
+function ff_hevc_put_hevc_qpel_bi_hv32_8_neon, export=1
+        // tmp_array = (height + 7) * 128 bytes.  NOTE(review): 128 looks
+        // like the intermediate row stride in bytes (64 int16_t), i.e.
+        // MAX_PB_SIZE * sizeof(int16_t) — confirm.
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        // Preserve dst/dststride, src2/height, my and lr across pass 1.
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48         // pass-1 dst = tmp_array (above the 48 saved bytes)
+        sub             x1, x2, x3, lsl #1  // src -= 3 * srcstride:
+        sub             x1, x1, x3          // 3 context rows above for the 8-tap vertical filter
+        mov             x2, x3              // srcstride
+        add             x3, x5, #7          // filter height + 7 rows
+        mov             x4, x6              // mx
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #32 // width
+        // Shared loop pops tmp_array and performs the ret.
+        b               .Lqpel_bi_hv16_loop
+endfunc
+
+// Bi-pred QPEL H+V interpolation, width 48, 8-bit.  Pass 1: horizontal
+// 8-tap filter over height+7 rows into a stack temporary; pass 2:
+// tail-branch into the shared vertical+bi loop, which frees the
+// temporary and returns.
+function ff_hevc_put_hevc_qpel_bi_hv48_8_neon, export=1
+        // (height + 7) * 128-byte temporary; NOTE(review): 128 presumably
+        // equals the MAX_PB_SIZE int16_t row stride in bytes — confirm.
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        // Preserve dst/dststride, src2/height, my and lr across pass 1.
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48         // pass-1 dst = tmp_array (above the 48 saved bytes)
+        sub             x1, x2, x3, lsl #1  // src -= 3 * srcstride:
+        sub             x1, x1, x3          // 3 context rows above for the 8-tap vertical filter
+        mov             x2, x3              // srcstride
+        add             x3, x5, #7          // filter height + 7 rows
+        mov             x4, x6              // mx
+        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #48 // width
+        // Shared loop pops tmp_array and performs the ret.
+        b               .Lqpel_bi_hv16_loop
+endfunc
+
+// Bi-pred QPEL H+V interpolation, width 64, 8-bit.  Pass 1: horizontal
+// 8-tap filter over height+7 rows into a stack temporary; pass 2:
+// tail-branch into the shared vertical+bi loop, which frees the
+// temporary and returns.
+function ff_hevc_put_hevc_qpel_bi_hv64_8_neon, export=1
+        // (height + 7) * 128-byte temporary; NOTE(review): 128 presumably
+        // equals the MAX_PB_SIZE int16_t row stride in bytes — confirm.
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        // Preserve dst/dststride, src2/height, my and lr across pass 1.
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48         // pass-1 dst = tmp_array (above the 48 saved bytes)
+        sub             x1, x2, x3, lsl #1  // src -= 3 * srcstride:
+        sub             x1, x1, x3          // 3 context rows above for the 8-tap vertical filter
+        mov             x2, x3              // srcstride
+        add             x3, x5, #7          // filter height + 7 rows
+        mov             x4, x6              // mx
+        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #64          // width
+        // Shared loop pops tmp_array and performs the ret.
+        b               .Lqpel_bi_hv16_loop
+endfunc