[FFmpeg-devel] 8-bit hevc decoding optimization on aarch64 with neon

Message ID 20171118173548.12393-1-fatwildcat@gmail.com
State New

Commit Message

Rafal Dabrowa Nov. 18, 2017, 5:35 p.m. UTC
This is a proposal of performance optimizations for 8-bit
HEVC video decoding on the aarch64 platform with the NEON (SIMD) extension.

I tested my optimizations on a NanoPi M3 device, using mainly
a 1280x720 "Big Buck Bunny" video file. The file was pulled from
the libde265.org page, see
http://www.libde265.org/hevc-bitstreams/bbb-1280x720-cfg06.mkv
The movie duration is 00:10:34.53.

The overall performance gain is about 2x. Without the optimizations,
playback stalls after a few seconds in practice. With the
optimizations, the file plays smoothly 99% of the time.

For performance testing the following command was used:

    time ./ffmpeg -hide_banner -i ~/bbb-1280x720-cfg06.mkv -f yuv4mpegpipe - >/dev/null

The video file was pre-read before the test to minimize disk reads during testing.
Program execution time without the optimizations was as follows:

real	11m48.576s
user	43m8.111s
sys	0m12.469s

Execution time with optimizations:

real	6m17.046s
user	21m19.792s
sys	0m14.724s


The patch contains optimizations for the most heavily used qpel, epel, sao and
idct functions. Among the functions considered for optimization there are two
that are used intensively but not optimized in this patch:
hevc_v_loop_filter_luma_8 and hevc_h_loop_filter_luma_8. I have no idea how
they could be optimized, hence I left them unoptimized.



Signed-off-by: Rafal Dabrowa <fatwildcat@gmail.com>
---
 libavcodec/aarch64/Makefile               |    5 +
 libavcodec/aarch64/hevcdsp_epel_8.S       | 3949 ++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_idct_8.S       | 1980 ++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  170 +
 libavcodec/aarch64/hevcdsp_qpel_8.S       | 5666 +++++++++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_sao_8.S        |  166 +
 libavcodec/hevcdsp.c                      |    2 +
 libavcodec/hevcdsp.h                      |    1 +
 8 files changed, 11939 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_epel_8.S
 create mode 100644 libavcodec/aarch64/hevcdsp_idct_8.S
 create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/hevcdsp_qpel_8.S
 create mode 100644 libavcodec/aarch64/hevcdsp_sao_8.S

Comments

Carl Eugen Hoyos Nov. 18, 2017, 5:50 p.m. UTC | #1
2017-11-18 18:35 GMT+01:00 Rafal Dabrowa <fatwildcat@gmail.com>:

> For performance testing the following command was used:
>
>     time ./ffmpeg -hide_banner -i ~/bbb-1280x720-cfg06.mkv -f yuv4mpegpipe - >/dev/null

An alternative is:
./ffmpeg -benchmark -i ~/bbb-1280x720-cfg06.mkv -f null -

> The video file was pre-read before test to minimize disk reads during testing.
> Program execution time without optimization was as follows:
>
> real    11m48.576s
> user    43m8.111s
> sys     0m12.469s
>
> Execution time with optimizations:
>
> real    6m17.046s
> user    21m19.792s
> sys     0m14.724s

Looks impressive.


> +av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (have_neon(cpu_flags) && bit_depth == 8) {
> +        NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels);
> +        NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h);
> +        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v);
> +        NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv);
> +        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v);
> +        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv);
> +        NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 0, pel_bi_pixels);
> +        NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 1, epel_bi_h);
> +        NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 0, epel_bi_v);
> +        NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 1, epel_bi_hv);
> +        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels);
> +        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
> +        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
> +        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
> +        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
> +        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
> +        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
> +        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 0, pel_bi_pixels);
> +        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 1, qpel_bi_h);
> +        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v);
> +        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv);

I wonder if it would have made sense to test and send these patches
in smaller portions, so that those with possible improvements
can be identified.

Thank you, Carl Eugen
Rostislav Pehlivanov Nov. 18, 2017, 6:31 p.m. UTC | #2
>
>
>
> On 18 November 2017 at 17:35, Rafal Dabrowa <fatwildcat@gmail.com> wrote:
>
> This is a proposal of performance optimizations for 8-bit
> hevc video decoding on aarch64 platform with neon (simd) extension.
>
> I'm testing my optimizations on NanoPi M3 device. I'm using
> mainly "Big Buck Bunny" video file in format 1280x720 for testing.
> The video file was pulled from libde265.org page, see
> http://www.libde265.org/hevc-bitstreams/bbb-1280x720-cfg06.mkv
> The movie duration is 00:10:34.53.
>
> Overall performance gain is about 2x. Without optimizations the movie
> playback stops in practice after a few seconds. With
> optimizations the file is played smoothly 99% of the time.
>
> For performance testing the following command was used:
>
>     time ./ffmpeg -hide_banner -i ~/bbb-1280x720-cfg06.mkv -f yuv4mpegpipe
> - >/dev/null
>
> The video file was pre-read before test to minimize disk reads during
> testing.
> Program execution time without optimization was as follows:
>
> real    11m48.576s
> user    43m8.111s
> sys     0m12.469s
>
> Execution time with optimizations:
>
> real    6m17.046s
> user    21m19.792s
> sys     0m14.724s
>
>
> The patch contains optimizations for most heavily used qpel, epel, sao and
> idct
> functions.  Among the functions provided for optimization there are two
> intensively used, but not optimized in this patch:
> hevc_v_loop_filter_luma_8
> and hevc_h_loop_filter_luma_8. I have no idea how they could be optimized
> hence I leaved them without optimizations.
>
>
>
> Signed-off-by: Rafal Dabrowa <fatwildcat@gmail.com>
> ---
>  libavcodec/aarch64/Makefile               |    5 +
>  libavcodec/aarch64/hevcdsp_epel_8.S       | 3949 ++++++++++++++++++++
>  libavcodec/aarch64/hevcdsp_idct_8.S       | 1980 ++++++++++
>  libavcodec/aarch64/hevcdsp_init_aarch64.c |  170 +
>  libavcodec/aarch64/hevcdsp_qpel_8.S       | 5666
> +++++++++++++++++++++++++++++
>  libavcodec/aarch64/hevcdsp_sao_8.S        |  166 +
>  libavcodec/hevcdsp.c                      |    2 +
>  libavcodec/hevcdsp.h                      |    1 +
>  8 files changed, 11939 insertions(+)
>  create mode 100644 libavcodec/aarch64/hevcdsp_epel_8.S
>  create mode 100644 libavcodec/aarch64/hevcdsp_idct_8.S
>  create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c
>  create mode 100644 libavcodec/aarch64/hevcdsp_qpel_8.S
>  create mode 100644 libavcodec/aarch64/hevcdsp_sao_8.S



Very nice.
The way we test SIMD is to put START_TIMER("function_name"); and
STOP_TIMER; (they're located in libavutil/timer.h) around where the
function gets called in the C code, then we do a run with the C code (no
SIMD) and a separate run with whatever SIMD optimizations we're
implementing. We take the last printed value of both runs and that's what's
used to measure speedup.
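
As a minimal, hypothetical illustration of where the macros go (this helper is
not part of the patch; note that in libavutil/timer.h START_TIMER takes no
argument and the label is passed to STOP_TIMER):

    #include <stdint.h>

    #include "libavutil/timer.h"
    #include "libavcodec/hevcdsp.h"

    /* Hypothetical helper: time one call through the HEVCDSPContext table. */
    static void timed_idct_16x16(HEVCDSPContext *c, int16_t *coeffs, int col_limit)
    {
        START_TIMER
        c->idct[2](coeffs, col_limit);   /* 16x16 transform is index 2 of idct[] */
        STOP_TIMER("idct_16x16_8")       /* periodically prints UNITS, runs and skips */
    }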

I don't think there's a need to split the patch into multiple patches for
each individual version yet, though; that's usually only done if some
function's C implementation is faster than the SIMD code.
James Almer Nov. 18, 2017, 6:41 p.m. UTC | #3
On 11/18/2017 3:31 PM, Rostislav Pehlivanov wrote:
>>
>>
>>
>> On 18 November 2017 at 17:35, Rafal Dabrowa <fatwildcat@gmail.com> wrote:
>>
>> This is a proposal of performance optimizations for 8-bit
>> hevc video decoding on aarch64 platform with neon (simd) extension.
>>
>> I'm testing my optimizations on NanoPi M3 device. I'm using
>> mainly "Big Buck Bunny" video file in format 1280x720 for testing.
>> The video file was pulled from libde265.org page, see
>> http://www.libde265.org/hevc-bitstreams/bbb-1280x720-cfg06.mkv
>> The movie duration is 00:10:34.53.
>>
>> Overall performance gain is about 2x. Without optimizations the movie
>> playback stops in practice after a few seconds. With
>> optimizations the file is played smoothly 99% of the time.
>>
>> For performance testing the following command was used:
>>
>>     time ./ffmpeg -hide_banner -i ~/bbb-1280x720-cfg06.mkv -f yuv4mpegpipe
>> - >/dev/null
>>
>> The video file was pre-read before test to minimize disk reads during
>> testing.
>> Program execution time without optimization was as follows:
>>
>> real    11m48.576s
>> user    43m8.111s
>> sys     0m12.469s
>>
>> Execution time with optimizations:
>>
>> real    6m17.046s
>> user    21m19.792s
>> sys     0m14.724s
>>
>>
>> The patch contains optimizations for most heavily used qpel, epel, sao and
>> idct
>> functions.  Among the functions provided for optimization there are two
>> intensively used, but not optimized in this patch:
>> hevc_v_loop_filter_luma_8
>> and hevc_h_loop_filter_luma_8. I have no idea how they could be optimized
>> hence I leaved them without optimizations.
>>
>>
>>
>> Signed-off-by: Rafal Dabrowa <fatwildcat@gmail.com>
>> ---
>>  libavcodec/aarch64/Makefile               |    5 +
>>  libavcodec/aarch64/hevcdsp_epel_8.S       | 3949 ++++++++++++++++++++
>>  libavcodec/aarch64/hevcdsp_idct_8.S       | 1980 ++++++++++
>>  libavcodec/aarch64/hevcdsp_init_aarch64.c |  170 +
>>  libavcodec/aarch64/hevcdsp_qpel_8.S       | 5666
>> +++++++++++++++++++++++++++++
>>  libavcodec/aarch64/hevcdsp_sao_8.S        |  166 +
>>  libavcodec/hevcdsp.c                      |    2 +
>>  libavcodec/hevcdsp.h                      |    1 +
>>  8 files changed, 11939 insertions(+)
>>  create mode 100644 libavcodec/aarch64/hevcdsp_epel_8.S
>>  create mode 100644 libavcodec/aarch64/hevcdsp_idct_8.S
>>  create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c
>>  create mode 100644 libavcodec/aarch64/hevcdsp_qpel_8.S
>>  create mode 100644 libavcodec/aarch64/hevcdsp_sao_8.S
> 
> 
> 
> Very nice.
> The way we test SIMD is to put START_TIMER("function_name"); and
> STOP_TIMER; (they're located in libavutil/timer.h) around where the
> function gets called in the C code, then we do a run with the C code (no
> SIMD) and a separate run with whatever SIMD optimizations we're
> implementing. We take the last printed value of both runs and that's what's
> used to measure speedup.
> 
> I don't think there's a need to split the patch into multiple patches for
> each idividual version though yet, that's usually only done if some
> function's C implementation is faster than the SIMD code.

It would be nice, however, to at least split it into two patches, one for
MC and one for SAO.

Also, is there no way to use macros in aarch64 asm files? ~11k lines of code
is a lot to add, and I'm sure a sizable portion is duplicated with only small
differences between functions.
Rafal Dabrowa Nov. 19, 2017, 2:43 p.m. UTC | #4
On 11/18/2017 07:41 PM, James Almer wrote:
> On 11/18/2017 3:31 PM, Rostislav Pehlivanov wrote:
>>>
>>>
>>> On 18 November 2017 at 17:35, Rafal Dabrowa <fatwildcat@gmail.com> wrote:
>>>
>>> This is a proposal of performance optimizations for 8-bit
>>> hevc video decoding on aarch64 platform with neon (simd) extension.
>>>
>>> I'm testing my optimizations on NanoPi M3 device. I'm using
>>> mainly "Big Buck Bunny" video file in format 1280x720 for testing.
>>> The video file was pulled from libde265.org page, see
>>> http://www.libde265.org/hevc-bitstreams/bbb-1280x720-cfg06.mkv
>>> The movie duration is 00:10:34.53.
>>>
>>> Overall performance gain is about 2x. Without optimizations the movie
>>> playback stops in practice after a few seconds. With
>>> optimizations the file is played smoothly 99% of the time.
>>>
>>> For performance testing the following command was used:
>>>
>>>      time ./ffmpeg -hide_banner -i ~/bbb-1280x720-cfg06.mkv -f yuv4mpegpipe
>>> - >/dev/null
>>>
>>> The video file was pre-read before test to minimize disk reads during
>>> testing.
>>> Program execution time without optimization was as follows:
>>>
>>> real    11m48.576s
>>> user    43m8.111s
>>> sys     0m12.469s
>>>
>>> Execution time with optimizations:
>>>
>>> real    6m17.046s
>>> user    21m19.792s
>>> sys     0m14.724s
>>>
>>>
>>> The patch contains optimizations for most heavily used qpel, epel, sao and
>>> idct
>>> functions.  Among the functions provided for optimization there are two
>>> intensively used, but not optimized in this patch:
>>> hevc_v_loop_filter_luma_8
>>> and hevc_h_loop_filter_luma_8. I have no idea how they could be optimized
>>> hence I leaved them without optimizations.
>>>
>>>
>>>
>>> Signed-off-by: Rafal Dabrowa <fatwildcat@gmail.com>
>>> ---
>>>   libavcodec/aarch64/Makefile               |    5 +
>>>   libavcodec/aarch64/hevcdsp_epel_8.S       | 3949 ++++++++++++++++++++
>>>   libavcodec/aarch64/hevcdsp_idct_8.S       | 1980 ++++++++++
>>>   libavcodec/aarch64/hevcdsp_init_aarch64.c |  170 +
>>>   libavcodec/aarch64/hevcdsp_qpel_8.S       | 5666
>>> +++++++++++++++++++++++++++++
>>>   libavcodec/aarch64/hevcdsp_sao_8.S        |  166 +
>>>   libavcodec/hevcdsp.c                      |    2 +
>>>   libavcodec/hevcdsp.h                      |    1 +
>>>   8 files changed, 11939 insertions(+)
>>>   create mode 100644 libavcodec/aarch64/hevcdsp_epel_8.S
>>>   create mode 100644 libavcodec/aarch64/hevcdsp_idct_8.S
>>>   create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c
>>>   create mode 100644 libavcodec/aarch64/hevcdsp_qpel_8.S
>>>   create mode 100644 libavcodec/aarch64/hevcdsp_sao_8.S
>>
>>
>> Very nice.
>> The way we test SIMD is to put START_TIMER("function_name"); and
>> STOP_TIMER; (they're located in libavutil/timer.h) around where the
>> function gets called in the C code, then we do a run with the C code (no
>> SIMD) and a separate run with whatever SIMD optimizations we're
>> implementing. We take the last printed value of both runs and that's what's
>> used to measure speedup.
>>
>> I don't think there's a need to split the patch into multiple patches for
>> each idividual version though yet, that's usually only done if some
>> function's C implementation is faster than the SIMD code.
> It would be nice however to at least split it into two patches, one for
> MC and one for SAO.
Could you explain which functions are MC?

I can split the patch into a few, but a dependency between the patches
is unavoidable because the non-optimized function pointers are all
replaced with optimized ones together, in one function body.
One of the patches must add the function and also add the function call.
>
> Also, no way to use macros in aarch64 asm files? ~11k lines of code is a
> lot to add, and I'm sure a sizable portion is duplicated with only some
> small differences between functions.
I used macros sparingly because code without macros is
easier to understand and to improve. Sometimes even the order
of the assembly instructions is important. But, of course, I can reduce
the code size using macros if the patch is accepted. I didn't know
whether you were interested in the patch at all.


Regarding performance testing: I wrapped every function with another
one which calls START_TIMER and STOP_TIMER. It looks like these macros
aren't reentrant, so I needed to force the program to run in a single thread.
Without this I had strange results that differed a lot between runs, for
example:

22190 UNITS in put_hevc_qpel_uni_h12_8,   16232 runs,    152 skips
1126 UNITS in put_hevc_qpel_uni_h12_8,   12001 runs,   4383 skips

Forcing single-threaded mode was not easy; the -filter_threads
option didn't help.
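
For illustration, a sketch of that wrapping idea with hypothetical names and a
simplified signature (the real hevcdsp prototypes take more arguments):

    #include <stddef.h>
    #include <stdint.h>

    #include "libavutil/timer.h"

    /* Original pointer saved from the DSP function table before it is
     * replaced with the timed wrapper below. */
    static void (*orig_put_pixels)(int16_t *dst, const uint8_t *src,
                                   ptrdiff_t srcstride, int height);

    static void timed_put_pixels(int16_t *dst, const uint8_t *src,
                                 ptrdiff_t srcstride, int height)
    {
        START_TIMER
        orig_put_pixels(dst, src, srcstride, height);
        STOP_TIMER("put_hevc_pel_pixels8_8")   /* name as shown in the table below */
    }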

Below is the outcome. Meaning of the columns:

FUNCTION - the function to optimize
UNITS_NOOPT - last UNITS result in the run without optimizations
OPT - last UNITS result in the run with optimizations
CALLS - sum of runs and skips
NSKIPS - number of skips in the non-optimized version
OSKIPS - number of skips in the optimized version


FUNCTION                 UNITS_NOOPT      OPT     CALLS   NSKIPS OSKIPS
-------------------------------------------------------------------------
idct_16x16_8                  113074    24079   2097152 0        0
idct_32x32_8                  587447   100434    524288 0        0
put_hevc_epel_bi_h4_8           7651     3654    524288      177 1857
put_hevc_epel_bi_h6_8          18377     6668     32768 0        0
put_hevc_epel_bi_h8_8          20644     6698   1048576       34 1298
put_hevc_epel_bi_h12_8         62927    18968     16384 0        0
put_hevc_epel_bi_h16_8         78601    21254    524288 0        4
put_hevc_epel_bi_h24_8        231004    53800      4096 0        0
put_hevc_epel_bi_h32_8        294058    63302    524288 0        0
put_hevc_epel_bi_hv4_8         13183     6264   2097152       67 3057
put_hevc_epel_bi_hv6_8         27672    12706    131072 0        0
put_hevc_epel_bi_hv8_8         31908    11184   2097152        4 1688
put_hevc_epel_bi_hv12_8        86370    29497     65536 0        0
put_hevc_epel_bi_hv16_8       104623    30717   1048576 0        3
put_hevc_epel_bi_hv24_8       302361    80610      8192 0        0
put_hevc_epel_bi_hv32_8       376614    92475   1048576 0        0
put_hevc_epel_bi_v4_8           7290     3368   2097152      338 4444
put_hevc_epel_bi_v6_8          19306     8423     65536 0        0
put_hevc_epel_bi_v8_8          20431     5795   2097152       12 2252
put_hevc_epel_bi_v12_8         61368    21050     16384 0        0
put_hevc_epel_bi_v16_8         74351    17655   1048576 0        9
put_hevc_epel_bi_v24_8        226914    51601      4096 0        0
put_hevc_epel_bi_v32_8        285476    55184   1048576 0        0
put_hevc_epel_h4_8              5826     3362    524288      667 2619
put_hevc_epel_h6_8             12852     5912     32768 0        0
put_hevc_epel_h8_8             13847     6009   1048576      237 1504
put_hevc_epel_h12_8            44210    17185     16384 0        0
put_hevc_epel_h16_8            53502    18642    524288 0        5
put_hevc_epel_h24_8           157030    48086      4096 0        0
put_hevc_epel_h32_8           193877    54837    524288 0        0
put_hevc_epel_hv4_8            11031     6379   2097152      316 1886
put_hevc_epel_hv6_8            23233    12730    131072 0        0
put_hevc_epel_hv8_8            25406    10989   2097152       21 1471
put_hevc_epel_hv12_8           70139    28821     65536 0        0
put_hevc_epel_hv16_8           81318    30190   1048576 0        4
put_hevc_epel_hv24_8          230829    75079     16384 0        0
put_hevc_epel_hv32_8          285945    92143   1048576 0        0
put_hevc_epel_uni_hv4_8        13255     7571   2097152 142      582
put_hevc_epel_uni_hv6_8        29279    14637    131072 0        0
put_hevc_epel_uni_hv8_8        31783    14114   1048576 0       26
put_hevc_epel_uni_hv12_8       85576    31757     32768 0        0
put_hevc_epel_uni_hv16_8       90346    29886    524288 0        0
put_hevc_epel_uni_hv24_8      281864    76862      1024 0        0
put_hevc_epel_uni_hv32_8      322135    91541     65536 0        0
put_hevc_epel_uni_v4_8          6826     3785   2097152      494 3496
put_hevc_epel_uni_v6_8         20113    10093     32768 0        0
put_hevc_epel_uni_v8_8         18883     6444   1048576 7      448
put_hevc_epel_uni_v12_8        59989    23523      8192 0        0
put_hevc_epel_uni_v16_8        63740    18096    262144 0        0
put_hevc_epel_uni_v24_8       208109    48880       512 0        0
put_hevc_epel_uni_v32_8       249717    50660    262144 0        0
put_hevc_epel_v4_8              5834     3056   2097152      970 5422
put_hevc_epel_v6_8             15541     8900     65536 0        0
put_hevc_epel_v8_8             14549     5476   2097152      296 3129
put_hevc_epel_v12_8            48518    22362     32768 0        0
put_hevc_epel_v16_8            53909    16483   1048576 0       23
put_hevc_epel_v24_8           166783    43662      4096 0        0
put_hevc_epel_v32_8           210650    47112   1048576 0        0
put_hevc_pel_bi_pixels4_8       4751     2923   2097152     7381 9232
put_hevc_pel_bi_pixels6_8      11774     5689     65536 0        0
put_hevc_pel_bi_pixels8_8      12269     4165   4194304     2298 12731
put_hevc_pel_bi_pixels12_8     36260    14031     65536 0        0
put_hevc_pel_bi_pixels16_8     42718    10421   4194304       21 3881
put_hevc_pel_bi_pixels24_8    137480    38423     32768 0        0
put_hevc_pel_bi_pixels32_8    172166    43996   8388608 0        3
put_hevc_pel_bi_pixels48_8    520118   133238      4096 0        0
put_hevc_pel_bi_pixels64_8    671892   173615   4194304 0        0
put_hevc_pel_pixels4_8          3859     3139   1048576     8926 9478
put_hevc_pel_pixels6_8          8453     6566     32768 0        0
put_hevc_pel_pixels8_8          7144     3093   4194304     4802 30239
put_hevc_pel_pixels12_8        25096    16648     65536 0        0
put_hevc_pel_pixels16_8        25472     9538   2097152      790 3094
put_hevc_pel_pixels24_8        93108    42948     32768 0        0
put_hevc_pel_pixels32_8       100331    37550   8388608 0        2
put_hevc_pel_pixels48_8       321258   137835      4096 0        0
put_hevc_pel_pixels64_8       387236   152538   4194304 0        0
put_hevc_qpel_bi_h4_8          34054    20498     16384 0        0
put_hevc_qpel_bi_h8_8          34264    10873    524288 0      801
put_hevc_qpel_bi_h12_8         85199    22938     16384 0        0
put_hevc_qpel_bi_h16_8        107035    20526    524288 0      488
put_hevc_qpel_bi_h24_8        323233    66440     16384 0        0
put_hevc_qpel_bi_h32_8        415699    76073    262144 0        0
put_hevc_qpel_bi_h48_8       1282990   246145      2048 0        0
put_hevc_qpel_bi_h64_8       1664853   260382    262144 0        0
put_hevc_qpel_bi_hv4_8         56239    31221     32768 0        0
put_hevc_qpel_bi_hv8_8         63859    21595   1048576 0       63
put_hevc_qpel_bi_hv12_8       143173    58139     65536 0        0
put_hevc_qpel_bi_hv16_8       184410    40468   1048576 0       15
put_hevc_qpel_bi_hv24_8       509364   134833     32768 0        0
put_hevc_qpel_bi_hv32_8       647015   125581    524288 0        0
put_hevc_qpel_bi_hv48_8      1929283   385204      4096 0        0
put_hevc_qpel_bi_hv64_8      2416442   430161    524288 0        0
put_hevc_qpel_bi_v4_8          37454    22461     32768 0        0
put_hevc_qpel_bi_v8_8          34500     9218   1048576        0 1291
put_hevc_qpel_bi_v12_8         87403    31659     32768 0        0
put_hevc_qpel_bi_v16_8        106589    19326   1048576 0      971
put_hevc_qpel_bi_v24_8        332644    78044     16384 0        0
put_hevc_qpel_bi_v32_8        405835    73886    524288 0        0
put_hevc_qpel_bi_v48_8       1266494   217496      2048 0        0
put_hevc_qpel_bi_v64_8       1677771   259481    524288 0        0
put_hevc_qpel_h4_8             29542    16982     16384 0        0
put_hevc_qpel_h8_8             26710    10452    524288 5      558
put_hevc_qpel_h12_8            67708    22021     16384 0        0
put_hevc_qpel_h16_8            81849    18637    524288 0      560
put_hevc_qpel_h24_8           258384    62392     16384 0        0
put_hevc_qpel_h32_8           321281    68451    262144 0        0
put_hevc_qpel_h48_8           984759   219657      2048 0        0
put_hevc_qpel_h64_8          1224717   227914    262144 0        0
put_hevc_qpel_hv4_8            51764    32150     32768 0        0
put_hevc_qpel_hv8_8            56369    21627   1048576 0       73
put_hevc_qpel_hv12_8          125191    48671     65536 0        0
put_hevc_qpel_hv16_8          159288    40749   1048576 0       10
put_hevc_qpel_hv24_8          438656   131331     32768 0        0
put_hevc_qpel_hv32_8          551607   121954    524288 0        0
put_hevc_qpel_hv48_8         1627266   397656      4096 0        0
put_hevc_qpel_hv64_8         2016176   414765    524288 0        0
put_hevc_qpel_uni_h4_8         21301    13384    131072 0        0
put_hevc_qpel_uni_h8_8         30057    11010    524288 7      486
put_hevc_qpel_uni_h12_8        84804    25790     16384 0        0
put_hevc_qpel_uni_h16_8        95333    24267    262144 0       17
put_hevc_qpel_uni_h24_8       318029    76951      4096 0        0
put_hevc_qpel_uni_h32_8       356799    72279     65536 0        0
put_hevc_qpel_uni_h48_8      1181308   237731       128 0        0
put_hevc_qpel_uni_h64_8      1401262   231221     16384 0        0
put_hevc_qpel_uni_hv4_8        39439    22837    262144 0        1
put_hevc_qpel_uni_hv8_8        60380    23283   1048576 0       77
put_hevc_qpel_uni_hv12_8      146759    56280     32768 0        0
put_hevc_qpel_uni_hv16_8      173329    45131    524288 0        2
put_hevc_qpel_uni_hv24_8      505434   139999     16384 0        0
put_hevc_qpel_uni_hv32_8      561402   120361    131072 0        0
put_hevc_qpel_uni_hv48_8     1854753   361780       256 0        0
put_hevc_qpel_uni_hv64_8     2142627   404073     32768 0        0
put_hevc_qpel_uni_v4_8         23081    12550    262144 0        0
put_hevc_qpel_uni_v8_8         30075     9971   1048576 5      511
put_hevc_qpel_uni_v12_8        89427    38025     16384 0        0
put_hevc_qpel_uni_v16_8        96131    21727    524288 0       23
put_hevc_qpel_uni_v24_8       328019    90689      8192 0        0
put_hevc_qpel_uni_v32_8       358340    71396    131072 0        0
put_hevc_qpel_uni_v48_8      1164812   176367       256 0        0
put_hevc_qpel_uni_v64_8      1464856   232866     32768 0        0
put_hevc_qpel_v4_8             31732    19999     32768 0        0
put_hevc_qpel_v8_8             25311     8967   1048576       10 1142
put_hevc_qpel_v12_8            67764    29917     32768 0        0
put_hevc_qpel_v16_8            78023    18260   1048576 0      819
put_hevc_qpel_v24_8           254724    75185     16384 0        0
put_hevc_qpel_v32_8           305639    69130    524288 0        0
put_hevc_qpel_v48_8           892900   240703      2048 0        0
put_hevc_qpel_v64_8          1149597   221632    524288 0        0
sao_edge_filter_8             600074    91811    524288 0        0
Shengbin Meng Nov. 21, 2017, 10:51 a.m. UTC | #5
> On 19 Nov 2017, at 01:35, Rafal Dabrowa <fatwildcat@gmail.com> wrote:
> 
> 
> This is a proposal of performance optimizations for 8-bit
> hevc video decoding on aarch64 platform with neon (simd) extension.

Nice to see the work for aarch64! 

We are also in the process of doing NEON optimization for HEVC decoding (http://ffmpeg.org/pipermail/ffmpeg-devel/2017-October/218233.html).

Now we are just about to finish the arm 32-bit work and are ready to send some patches out. Looks like for aarch64 we can join forces :) What do you think?

> 
> The patch contains optimizations for most heavily used qpel, epel, sao and idct
> functions.  Among the functions provided for optimization there are two
> intensively used, but not optimized in this patch: hevc_v_loop_filter_luma_8
> and hevc_h_loop_filter_luma_8. I have no idea how they could be optimized
> hence I leaved them without optimizations.
> 

I see that an optimization for the loop filter already exists in the arm 32-bit code. Why not use that algorithm?


Regards,
Shengbin
Rafal Dabrowa Nov. 21, 2017, 2:24 p.m. UTC | #6
On 11/21/2017 11:51 AM, Shengbin Meng wrote:
>
>> On 19 Nov 2017, at 01:35, Rafal Dabrowa <fatwildcat@gmail.com> wrote:
>>
>>
>> This is a proposal of performance optimizations for 8-bit
>> hevc video decoding on aarch64 platform with neon (simd) extension.
>
> Nice to see the work for aarch64!
>
> We are also in the process of doing NEON optimization for HEVC 
> decoding. 
> (http://ffmpeg.org/pipermail/ffmpeg-devel/2017-October/218233.html)
>
> Now we are just about to finish arm 32-bit work and ready to send some 
> patches out. Looks like for aarch64 we can join force:) What do you think?
Why not. I started to work on aarch64 because my device, although it has a
VPU, does not support hevc in the VPU. Hence h264, even in full HD, is played
smoothly, but hevc playback looks poor. I was curious how much hevc decoding
might be optimized. I optimized one function, then another one...

Currently I'm focused on reducing the patch size, but I'm open to cooperation.

>
>>
>> The patch contains optimizations for most heavily used qpel, epel, 
>> sao and idct
>> functions.  Among the functions provided for optimization there are two
>> intensively used, but not optimized in this patch: 
>> hevc_v_loop_filter_luma_8
>> and hevc_h_loop_filter_luma_8. I have no idea how they could be optimized
>> hence I leaved them without optimizations.
>>
>
> I see that optimization for loop filter already exists for arm 32-bit 
> code. Why not use that algorithm?

Maybe... although optimization for aarch64 is a different story. I have
noticed that gcc with the -O3 option on aarch64 produces really good code. I
was surprised how much the code execution time was reduced in some cases.
Sometimes it is hard to optimize code better than the compiler does.


Rafal Dabrowa
Clément Bœsch Nov. 25, 2017, 8:25 a.m. UTC | #7
On Sat, Nov 18, 2017 at 06:35:48PM +0100, Rafal Dabrowa wrote:
> 
> This is a proposal of performance optimizations for 8-bit
> hevc video decoding on aarch64 platform with neon (simd) extension.
> 
> I'm testing my optimizations on NanoPi M3 device. I'm using
> mainly "Big Buck Bunny" video file in format 1280x720 for testing.
> The video file was pulled from libde265.org page, see
> http://www.libde265.org/hevc-bitstreams/bbb-1280x720-cfg06.mkv
> The movie duration is 00:10:34.53.
> 
> Overall performance gain is about 2x. Without optimizations the movie
> playback stops in practice after a few seconds. With
> optimizations the file is played smoothly 99% of the time.
> 
> For performance testing the following command was used:
> 
>     time ./ffmpeg -hide_banner -i ~/bbb-1280x720-cfg06.mkv -f yuv4mpegpipe - >/dev/null
> 
> The video file was pre-read before test to minimize disk reads during testing.
> Program execution time without optimization was as follows:
> 
> real	11m48.576s
> user	43m8.111s
> sys	0m12.469s
> 
> Execution time with optimizations:
> 
> real	6m17.046s
> user	21m19.792s
> sys	0m14.724s
> 

Can you post the results of checkasm --bench for hevc?

Did you run it to check for any calling convention violation?

> 
> The patch contains optimizations for most heavily used qpel, epel, sao and idct
> functions.  Among the functions provided for optimization there are two
> intensively used, but not optimized in this patch: hevc_v_loop_filter_luma_8
> and hevc_h_loop_filter_luma_8. I have no idea how they could be optimized
> hence I leaved them without optimizations.
> 

You may want to check x86/hevc_deblock.asm then (no idea if these are
implemented).

[...]
> +function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
> +    mov     x7, 128
> +1:  ld1     { v0.s }[0], [x1], x2
> +    ushll   v4.8h, v0.8b, 6

> +    st1     { v4.d }[0], [x0], x7

using #128 not possible?

> +    subs    x3, x3, 1
> +    b.ne    1b
> +    ret

here and below: no use of the x6 register?

A few comments on the style:

- please use a consistent spacing (current function mismatches with later
  code), preferably using a relatively large number of spaces as common
  ground (check the other sources)
- we use capitalized size suffixes (B, H, ...); IIRC the lower-case
  forms are problematic with some assemblers, but don't quote me on that.
- we don't use spaces between {}

> +endfunc
> +
> +function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
> +    mov     x7, 120
> +1:  ld1     { v0.8b }, [x1], x2
> +    ushll   v4.8h, v0.8b, 6

> +    st1     { v4.d }[0], [x0], 8

I think you need to use # as prefix for the immediates

> +    st1     { v4.s }[2], [x0], x7

I assume you can't use #120?

Have you checked if using #128 here and decrementing x0 afterward isn't
faster?

[...]
> +function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
> +    mov         x10, 128
> +1:  ld1         { v0.16b, v1.16b }, [x2], x3        // src
> +    ushll       v16.8h, v0.8b, 6
> +    ushll2      v17.8h, v0.16b, 6
> +    ushll       v18.8h, v1.8b, 6
> +    ushll2      v19.8h, v1.16b, 6
> +    ld1         { v20.8h, v21.8h, v22.8h, v23.8h }, [x4], x10   // src2
> +    sqadd       v16.8h, v16.8h, v20.8h
> +    sqadd       v17.8h, v17.8h, v21.8h
> +    sqadd       v18.8h, v18.8h, v22.8h
> +    sqadd       v19.8h, v19.8h, v23.8h

> +    sqrshrun    v0.8b,  v16.8h, 7
> +    sqrshrun2   v0.16b, v17.8h, 7
> +    sqrshrun    v1.8b,  v18.8h, 7
> +    sqrshrun2   v1.16b, v19.8h, 7

does pairing help here?

    sqrshrun    v0.8b,  v16.8h, 7
    sqrshrun    v1.8b,  v18.8h, 7
    sqrshrun2   v0.16b, v17.8h, 7
    sqrshrun2   v1.16b, v19.8h, 7

[...]
> +    sqrshrun    v0.8b,  v16.8h, 7
> +    sqrshrun2   v0.16b, v17.8h, 7
> +    sqrshrun    v1.8b,  v18.8h, 7
> +    sqrshrun2   v1.16b, v19.8h, 7
> +    sqrshrun    v2.8b,  v20.8h, 7
> +    sqrshrun2   v2.16b, v21.8h, 7
> +    sqrshrun    v3.8b,  v22.8h, 7
> +    sqrshrun2   v3.16b, v23.8h, 7

Again, this might be a good candidate for attempting to shuffle the
instructions and see if it helps (there are many other places, I picked
one randomly).

> +.Lepel_filters:

const/endconst + align might be better for all these labels

[...]
> +function ff_hevc_put_hevc_epel_hv12_8_neon, export=1
> +    add         x10, x3, 3
> +    lsl         x10, x10, 7
> +    sub         sp, sp, x10     // tmp_array
> +    stp         x0, x3, [sp, -16]!
> +    stp         x5, x30, [sp, -16]!
> +    add         x0, sp, 32
> +    sub         x1, x1, x2
> +    add         x3, x3, 3
> +    bl          ff_hevc_put_hevc_epel_h12_8_neon
> +    ldp         x5, x30, [sp], 16
> +    ldp         x0, x3, [sp], 16
> +    load_epel_filterh x5, x4
> +    mov         x5, 112
> +    mov         x10, 128
> +    ld1         { v16.8h, v17.8h }, [sp], x10
> +    ld1         { v18.8h, v19.8h }, [sp], x10
> +    ld1         { v20.8h, v21.8h }, [sp], x10
> +1:  ld1         { v22.8h, v23.8h }, [sp], x10
> +    calc_epelh  v4, v16, v18, v20, v22
> +    calc_epelh2 v4, v5, v16, v18, v20, v22
> +    calc_epelh  v5, v17, v19, v21, v23
> +    st1         { v4.8h }, [x0], 16
> +    st1         { v5.4h }, [x0], x5
> +    subs        x3, x3, 1
> +    b.eq        2f
> +

> +    ld1         { v16.8h, v17.8h }, [sp], x10
> +    calc_epelh  v4, v18, v20, v22, v16
> +    calc_epelh2 v4, v5, v18, v20, v22, v16
> +    calc_epelh  v5, v19, v21, v23, v17
> +    st1         { v4.8h }, [x0], 16
> +    st1         { v5.4h }, [x0], x5
> +    subs        x3, x3, 1
> +    b.eq        2f
> +
> +    ld1         { v18.8h, v19.8h }, [sp], x10
> +    calc_epelh  v4, v20, v22, v16, v18
> +    calc_epelh2 v4, v5, v20, v22, v16, v18
> +    calc_epelh  v5, v21, v23, v17, v19
> +    st1         { v4.8h }, [x0], 16
> +    st1         { v5.4h }, [x0], x5
> +    subs        x3, x3, 1
> +    b.eq        2f
> +
> +    ld1         { v20.8h, v21.8h }, [sp], x10
> +    calc_epelh  v4, v22, v16, v18, v20
> +    calc_epelh2 v4, v5, v22, v16, v18, v20
> +    calc_epelh  v5, v23, v17, v19, v21
> +    st1         { v4.8h }, [x0], 16
> +    st1         { v5.4h }, [x0], x5
> +    subs        x3, x3, 1
> +    b.ne        1b

Introducing macros probably makes sense in these functions

[...]
> +8:  b           9f                              // 0
> +    nop
> +    nop
> +    nop
> +    st1         { v29.b }[0], [x7]              // 1
> +    b           9f
> +    nop
> +    nop
> +    st1         { v29.h }[0], [x7]              // 2
> +    b           9f
> +    nop
> +    nop
> +    st1         { v29.h }[0], [x7], 2           // 3
> +    st1         { v29.b }[2], [x7]
> +    b           9f
> +    nop
> +    st1         { v29.s }[0], [x7]              // 4
> +    b           9f
> +    nop
> +    nop
> +    st1         { v29.s }[0], [x7], 4           // 5
> +    st1         { v29.b }[4], [x7]
> +    b           9f
> +    nop
> +    st1         { v29.s }[0], [x7], 4           // 6
> +    st1         { v29.h }[2], [x7]
> +    b           9f
> +    nop
> +    st1         { v29.s }[0], [x7], 4           // 7
> +    st1         { v29.h }[2], [x7], 2
> +    st1         { v29.b }[6], [x7]

What are these nops for? align?

[...]

Anyway, can you split your patch? It's really a lot of code and there is
no way anyone can review it properly quickly.

I also think macros would be welcome in many places to reduce the size of
the patch(es).

Regards,

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 72080c2dbb..f03814062c 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -14,6 +14,7 @@  OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
 OBJS-$(CONFIG_AAC_DECODER)              += aarch64/aacpsdsp_init_aarch64.o \
                                            aarch64/sbrdsp_init_aarch64.o
 OBJS-$(CONFIG_DCA_DECODER)              += aarch64/synth_filter_init.o
+OBJS-$(CONFIG_HEVC_DECODER)             += aarch64/hevcdsp_init_aarch64.o
 OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
 OBJS-$(CONFIG_VC1DSP)                   += aarch64/vc1dsp_init_aarch64.o
 OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
@@ -47,6 +48,10 @@  NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 # decoders/encoders
 NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
 NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_epel_8.o \
+                                           aarch64/hevcdsp_qpel_8.o \
+                                           aarch64/hevcdsp_sao_8.o \
+                                           aarch64/hevcdsp_idct_8.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
 NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
                                            aarch64/vp9itxfm_neon.o             \
diff --git a/libavcodec/aarch64/hevcdsp_epel_8.S b/libavcodec/aarch64/hevcdsp_epel_8.S
new file mode 100644
index 0000000000..508a7276f4
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_epel_8.S
@@ -0,0 +1,3949 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
+    mov     x7, 128
+1:  ld1     { v0.s }[0], [x1], x2
+    ushll   v4.8h, v0.8b, 6
+    st1     { v4.d }[0], [x0], x7
+    subs    x3, x3, 1
+    b.ne    1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
+    mov     x7, 120
+1:  ld1     { v0.8b }, [x1], x2
+    ushll   v4.8h, v0.8b, 6
+    st1     { v4.d }[0], [x0], 8
+    st1     { v4.s }[2], [x0], x7
+    subs    x3, x3, 1
+    b.ne    1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
+    mov     x7, 128
+1:  ld1     { v0.8b }, [x1], x2
+    ushll   v4.8h, v0.8b, 6
+    st1     { v4.8h }, [x0], x7
+    subs    x3, x3, 1
+    b.ne    1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
+    mov     x7, 112
+1:  ld1     { v0.8b, v1.8b }, [x1], x2
+    ushll   v4.8h, v0.8b, 6
+    st1     { v4.8h }, [x0], 16
+    ushll   v5.8h, v1.8b, 6
+    st1     { v5.d }[0], [x0], x7
+    subs    x3, x3, 1
+    b.ne    1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
+    mov     x7, 128
+1:  ld1     { v0.8b, v1.8b }, [x1], x2
+    ushll   v4.8h, v0.8b, 6
+    ushll   v5.8h, v1.8b, 6
+    st1     { v4.8h, v5.8h }, [x0], x7
+    subs    x3, x3, 1
+    b.ne    1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
+    mov     x7, 128
+1:  ld1     { v0.8b, v1.8b, v2.8b }, [x1], x2
+    ushll   v4.8h, v0.8b, 6
+    ushll   v5.8h, v1.8b, 6
+    ushll   v6.8h, v2.8b, 6
+    st1     { v4.8h, v5.8h, v6.8h }, [x0], x7
+    subs    x3, x3, 1
+    b.ne    1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
+    mov     x7, 128
+1:  ld1     { v0.8b, v1.8b, v2.8b, v3.8b }, [x1], x2
+    ushll   v4.8h, v0.8b, 6
+    ushll   v5.8h, v1.8b, 6
+    ushll   v6.8h, v2.8b, 6
+    ushll   v7.8h, v3.8b, 6
+    st1     { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], x7
+    subs    x3, x3, 1
+    b.ne    1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
+    mov     x7, 64
+1:  ld1     { v0.16b, v1.16b, v2.16b }, [x1], x2
+    ushll   v4.8h, v0.8b, 6
+    ushll2  v5.8h, v0.16b, 6
+    ushll   v6.8h, v1.8b, 6
+    ushll2  v7.8h, v1.16b, 6
+    st1     { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], 64
+    ushll   v4.8h, v2.8b, 6
+    ushll2  v5.8h, v2.16b, 6
+    st1     { v4.8h, v5.8h }, [x0], x7
+    subs    x3, x3, 1
+    b.ne    1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
+1:  ld1     { v0.16b, v1.16b, v2.16b, v3.16b }, [x1], x2
+    ushll   v4.8h, v0.8b, 6
+    ushll2  v5.8h, v0.16b, 6
+    ushll   v6.8h, v1.8b, 6
+    ushll2  v7.8h, v1.16b, 6
+    st1     { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], 64
+    ushll   v4.8h, v2.8b, 6
+    ushll2  v5.8h, v2.16b, 6
+    ushll   v6.8h, v3.8b, 6
+    ushll2  v7.8h, v3.16b, 6
+    st1     { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], 64
+    subs    x3, x3, 1
+    b.ne    1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
+    mov         x10, 128
+1:  ld1         { v0.s }[0], [x2], x3       // src
+    ushll       v16.8h, v0.8b, 6
+    ld1         { v20.4h }, [x4], x10   // src2
+    sqadd       v16.8h, v16.8h, v20.8h
+    sqrshrun    v0.8b,  v16.8h, 7
+    st1         { v0.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
+    mov         x10, 120
+    sub         x1, x1, 4
+1:  ld1         { v0.8b }, [x2], x3
+    ushll       v16.8h, v0.8b, 6
+    ld1         { v20.4h }, [x4], 8
+    ld1         { v20.s }[2], [x4], x10
+    sqadd       v16.8h, v16.8h, v20.8h
+    sqrshrun    v0.8b,  v16.8h, 7
+    st1         { v0.s }[0], [x0], 4
+    st1         { v0.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
+    mov         x10, 128
+1:  ld1         { v0.8b }, [x2], x3     // src
+    ushll       v16.8h, v0.8b, 6
+    ld1         { v20.8h }, [x4], x10   // src2
+    sqadd       v16.8h, v16.8h, v20.8h
+    sqrshrun    v0.8b,  v16.8h, 7
+    st1         { v0.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
+    mov         x10, 112
+    sub         x1, x1, 8
+1:  ld1         { v0.16b }, [x2], x3
+    ushll       v16.8h, v0.8b, 6
+    ushll2      v17.8h, v0.16b, 6
+    ld1         { v20.8h }, [x4], 16
+    ld1         { v21.4h }, [x4], x10
+    sqadd       v16.8h, v16.8h, v20.8h
+    sqadd       v17.8h, v17.8h, v21.8h
+    sqrshrun    v0.8b,  v16.8h, 7
+    sqrshrun2   v0.16b, v17.8h, 7
+    st1         { v0.8b }, [x0], 8
+    st1         { v0.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
+    mov         x10, 128
+1:  ld1         { v0.16b }, [x2], x3        // src
+    ushll       v16.8h, v0.8b, 6
+    ushll2      v17.8h, v0.16b, 6
+    ld1         { v20.8h, v21.8h }, [x4], x10   // src2
+    sqadd       v16.8h, v16.8h, v20.8h
+    sqadd       v17.8h, v17.8h, v21.8h
+    sqrshrun    v0.8b,  v16.8h, 7
+    sqrshrun2   v0.16b, v17.8h, 7
+    st1         { v0.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
+    mov         x10, 128
+1:  ld1         { v0.8b, v1.8b, v2.8b }, [x2], x3       // src
+    ushll       v16.8h, v0.8b, 6
+    ushll       v17.8h, v1.8b, 6
+    ushll       v18.8h, v2.8b, 6
+    ld1         { v20.8h, v21.8h, v22.8h }, [x4], x10   // src2
+    sqadd       v16.8h, v16.8h, v20.8h
+    sqadd       v17.8h, v17.8h, v21.8h
+    sqadd       v18.8h, v18.8h, v22.8h
+    sqrshrun    v0.8b, v16.8h, 7
+    sqrshrun    v1.8b, v17.8h, 7
+    sqrshrun    v2.8b, v18.8h, 7
+    st1         { v0.8b, v1.8b, v2.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
+    mov         x10, 128
+1:  ld1         { v0.16b, v1.16b }, [x2], x3        // src
+    ushll       v16.8h, v0.8b, 6
+    ushll2      v17.8h, v0.16b, 6
+    ushll       v18.8h, v1.8b, 6
+    ushll2      v19.8h, v1.16b, 6
+    ld1         { v20.8h, v21.8h, v22.8h, v23.8h }, [x4], x10   // src2
+    sqadd       v16.8h, v16.8h, v20.8h
+    sqadd       v17.8h, v17.8h, v21.8h
+    sqadd       v18.8h, v18.8h, v22.8h
+    sqadd       v19.8h, v19.8h, v23.8h
+    sqrshrun    v0.8b,  v16.8h, 7
+    sqrshrun2   v0.16b, v17.8h, 7
+    sqrshrun    v1.8b,  v18.8h, 7
+    sqrshrun2   v1.16b, v19.8h, 7
+    st1         { v0.16b, v1.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
+    mov         x10, 64
+1:  ld1         { v0.16b, v1.16b, v2.16b }, [x2], x3        // src
+    ushll       v16.8h, v0.8b, 6
+    ushll2      v17.8h, v0.16b, 6
+    ushll       v18.8h, v1.8b, 6
+    ushll2      v19.8h, v1.16b, 6
+    ushll       v20.8h, v2.8b, 6
+    ushll2      v21.8h, v2.16b, 6
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], 64    // src2
+    sqadd       v16.8h, v16.8h, v24.8h
+    sqadd       v17.8h, v17.8h, v25.8h
+    sqadd       v18.8h, v18.8h, v26.8h
+    sqadd       v19.8h, v19.8h, v27.8h
+    ld1         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v20.8h, v20.8h, v24.8h
+    sqadd       v21.8h, v21.8h, v25.8h
+    sqrshrun    v0.8b,  v16.8h, 7
+    sqrshrun2   v0.16b, v17.8h, 7
+    sqrshrun    v1.8b,  v18.8h, 7
+    sqrshrun2   v1.16b, v19.8h, 7
+    sqrshrun    v2.8b,  v20.8h, 7
+    sqrshrun2   v2.16b, v21.8h, 7
+    st1         { v0.16b, v1.16b, v2.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1
+1:  ld1         { v0.16b, v1.16b, v2.16b, v3.16b }, [x2], x3        // src
+    ushll       v16.8h, v0.8b, 6
+    ushll2      v17.8h, v0.16b, 6
+    ushll       v18.8h, v1.8b, 6
+    ushll2      v19.8h, v1.16b, 6
+    ushll       v20.8h, v2.8b, 6
+    ushll2      v21.8h, v2.16b, 6
+    ushll       v22.8h, v3.8b, 6
+    ushll2      v23.8h, v3.16b, 6
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], 64    // src2
+    sqadd       v16.8h, v16.8h, v24.8h
+    sqadd       v17.8h, v17.8h, v25.8h
+    sqadd       v18.8h, v18.8h, v26.8h
+    sqadd       v19.8h, v19.8h, v27.8h
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], 64
+    sqadd       v20.8h, v20.8h, v24.8h
+    sqadd       v21.8h, v21.8h, v25.8h
+    sqadd       v22.8h, v22.8h, v26.8h
+    sqadd       v23.8h, v23.8h, v27.8h
+    sqrshrun    v0.8b,  v16.8h, 7
+    sqrshrun2   v0.16b, v17.8h, 7
+    sqrshrun    v1.8b,  v18.8h, 7
+    sqrshrun2   v1.16b, v19.8h, 7
+    sqrshrun    v2.8b,  v20.8h, 7
+    sqrshrun2   v2.16b, v21.8h, 7
+    sqrshrun    v3.8b,  v22.8h, 7
+    sqrshrun2   v3.16b, v23.8h, 7
+    st1         { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+
+.Lepel_filters:
+    .byte  0,  0,  0,  0
+    .byte -2, 58, 10, -2
+    .byte -4, 54, 16, -2
+    .byte -6, 46, 28, -4
+    .byte -4, 36, 36, -4
+    .byte -4, 28, 46, -6
+    .byte -2, 16, 54, -4
+    .byte -2, 10, 58, -2
+
+.macro load_epel_filterb    freg, xreg
+    adr         \xreg, .Lepel_filters
+    add         \xreg, \xreg, \freg, lsl 2
+    ld4r        { v0.16b, v1.16b, v2.16b, v3.16b }, [\xreg] // filter
+    neg         v0.16b, v0.16b
+    neg         v3.16b, v3.16b
+.endm
+
+.macro calc_epelb dst, src1, src2, src3, src4
+    umlsl       \dst\().8h, \src1\().8b, v0.8b
+    umlal       \dst\().8h, \src2\().8b, v1.8b
+    umlal       \dst\().8h, \src3\().8b, v2.8b
+    umlsl       \dst\().8h, \src4\().8b, v3.8b
+.endm
+
+.macro calc_epelb2 dst, src1, src2, src3, src4
+    umlsl2      \dst\().8h, \src1\().16b, v0.16b
+    umlal2      \dst\().8h, \src2\().16b, v1.16b
+    umlal2      \dst\().8h, \src3\().16b, v2.16b
+    umlsl2      \dst\().8h, \src4\().16b, v3.16b
+.endm
+
+.macro load_epel_filterh freg, xreg
+    adr         \xreg, .Lepel_filters
+    add         \xreg, \xreg, \freg, lsl 2
+    ld1         { v0.8b }, [\xreg]
+    sxtl        v0.8h, v0.8b
+.endm
+
+.macro calc_epelh dst, src1, src2, src3, src4
+    smull       \dst\().4s, \src1\().4h, v0.h[0]
+    smlal       \dst\().4s, \src2\().4h, v0.h[1]
+    smlal       \dst\().4s, \src3\().4h, v0.h[2]
+    smlal       \dst\().4s, \src4\().4h, v0.h[3]
+    sqshrn      \dst\().4h, \dst\().4s, 6
+.endm
+
+.macro calc_epelh2 dst, dsttmp, src1, src2, src3, src4
+    smull2      \dsttmp\().4s, \src1\().8h, v0.h[0]
+    smlal2      \dsttmp\().4s, \src2\().8h, v0.h[1]
+    smlal2      \dsttmp\().4s, \src3\().8h, v0.h[2]
+    smlal2      \dsttmp\().4s, \src4\().8h, v0.h[3]
+    sqshrn2     \dst\().8h, \dsttmp\().4s, 6
+.endm
+
+
+function ff_hevc_put_hevc_epel_h4_8_neon, export=1
+    load_epel_filterb x4, x5
+    sub         x1, x1, 1
+    mov         x10, 128
+1:  ld1         { v4.8b }, [x1], x2
+    ushr        v5.2d, v4.2d, 8
+    ushr        v6.2d, v5.2d, 8
+    ushr        v7.2d, v6.2d, 8
+    movi        v16.8h, 0
+    calc_epelb  v16, v4, v5, v6, v7
+    st1         { v16.4h }, [x0], x10
+    subs        x3, x3, 1   // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h6_8_neon, export=1
+    load_epel_filterb x4, x5
+    sub         x1, x1, 1
+    sub         x2, x2, 8
+    mov         x10, 120
+1:  ld1         { v24.8b }, [x1], 8
+    ushr        v26.2d, v24.2d, 8
+    ushr        v27.2d, v26.2d, 8
+    ushr        v28.2d, v27.2d, 8
+    movi        v16.8h, 0
+    ld1         { v28.b }[5], [x1], x2
+    calc_epelb  v16, v24, v26, v27, v28
+    st1         { v16.4h }, [x0], 8
+    st1         { v16.s }[2], [x0], x10
+    subs        x3, x3, 1   // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h8_8_neon, export=1
+    load_epel_filterb x4, x5
+    sub         x1, x1, 1
+    mov         x10, 128
+1:  ld2         { v24.8b, v25.8b }, [x1], x2
+    ushr        v26.2d, v24.2d, 8
+    ushr        v27.2d, v25.2d, 8
+    ushr        v28.2d, v26.2d, 8
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    st2         { v16.4h, v17.4h }, [x0], x10
+    subs        x3, x3, 1   // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h12_8_neon, export=1
+    load_epel_filterb x4, x5
+    sub         x1, x1, 1
+    mov         x10, 112
+1:  ld2         { v24.8b, v25.8b }, [x1], x2
+    ushr        v26.2d, v24.2d, 8
+    ushr        v27.2d, v25.2d, 8
+    ushr        v28.2d, v26.2d, 8
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    zip1        v18.8h, v16.8h, v17.8h
+    zip2        v19.8h, v16.8h, v17.8h
+    st1         { v18.8h }, [x0], 16
+    st1         { v19.d }[0], [x0], x10
+    subs        x3, x3, 1   // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h16_8_neon, export=1
+    load_epel_filterb x4, x5
+    sub         x1, x1, 1
+    sub         x2, x2, 16
+    mov         x10, 128
+1:  ld2         { v24.8b, v25.8b }, [x1], 16
+    ld1         { v20.s }[0], [x1], x2
+    ushr        v26.2d, v24.2d, 8
+    ushr        v27.2d, v25.2d, 8
+    mov         v26.b[7], v20.b[0]
+    mov         v27.b[7], v20.b[1]
+    ushr        v28.2d, v26.2d, 8
+    mov         v28.b[7], v20.b[2]
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    st2         { v16.8h, v17.8h }, [x0], x10
+    subs        x3, x3, 1   // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h24_8_neon, export=1
+    load_epel_filterb x4, x5
+    sub         x1, x1, 1
+    sub         x2, x2, 24
+    mov         x10, 128
+1:  ld3         { v24.8b, v25.8b, v26.8b }, [x1], 24
+    ld1         { v20.s }[0], [x1], x2
+    ushr        v27.2d, v24.2d, 8
+    ushr        v28.2d, v25.2d, 8
+    ushr        v29.2d, v26.2d, 8
+    mov         v27.b[7], v20.b[0]
+    mov         v28.b[7], v20.b[1]
+    mov         v29.b[7], v20.b[2]
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    movi        v18.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    calc_epelb  v18, v26, v27, v28, v29
+    st3         { v16.8h, v17.8h, v18.8h }, [x0], x10
+    subs        x3, x3, 1   // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h32_8_neon, export=1
+    load_epel_filterb x4, x5
+    sub         x1, x1, 1
+    sub         x2, x2, 32
+    mov         x10, 128
+1:  ld4         { v24.8b, v25.8b, v26.8b, v27.8b }, [x1], 32
+    ld1         { v20.s }[0], [x1], x2
+    ushr        v28.2d, v24.2d, 8
+    ushr        v29.2d, v25.2d, 8
+    ushr        v30.2d, v26.2d, 8
+    ins         v28.b[7], v20.b[0]
+    ins         v29.b[7], v20.b[1]
+    ins         v30.b[7], v20.b[2]
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    movi        v18.8h, 0
+    movi        v19.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    calc_epelb  v18, v26, v27, v28, v29
+    calc_epelb  v19, v27, v28, v29, v30
+    st4         { v16.8h, v17.8h, v18.8h, v19.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h48_8_neon, export=1
+    load_epel_filterb x4, x5
+    sub         x1, x1, 1
+    sub         x2, x2, 48
+    mov         x5, 24
+    mov         x10, 128 - 48
+1:  ld3         { v26.16b, v27.16b, v28.16b }, [x1], x5
+    ushr        v29.2d, v26.2d, 8
+    ushr        v30.2d, v27.2d, 8
+    ushr        v31.2d, v28.2d, 8
+    ld1         { v24.s }[0], [x1], x5
+    ld1         { v25.s }[0], [x1], x2
+    mov         v29.b[7], v24.b[0]
+    mov         v30.b[7], v24.b[1]
+    mov         v31.b[7], v24.b[2]
+    mov         v29.b[15], v25.b[0]
+    mov         v30.b[15], v25.b[1]
+    mov         v31.b[15], v25.b[2]
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    movi        v18.8h, 0
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    calc_epelb  v16, v26, v27, v28, v29
+    calc_epelb2 v20, v26, v27, v28, v29
+    calc_epelb  v17, v27, v28, v29, v30
+    calc_epelb2 v21, v27, v28, v29, v30
+    calc_epelb  v18, v28, v29, v30, v31
+    calc_epelb2 v22, v28, v29, v30, v31
+    st3         { v16.8h, v17.8h, v18.8h }, [x0], 48
+    st3         { v20.8h, v21.8h, v22.8h }, [x0], x10
+    subs        x3, x3, 1   // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h64_8_neon, export=1
+    load_epel_filterb x4, x5
+    sub         x1, x1, 1
+    sub         x2, x2, 64
+    mov         x7, 32
+1:  ld4         { v24.16b, v25.16b, v26.16b, v27.16b }, [x1], x7
+    ushr        v28.2d, v24.2d, 8
+    ushr        v29.2d, v25.2d, 8
+    ushr        v30.2d, v26.2d, 8
+    ld1         { v4.s }[0], [x1], x7
+    ld1         { v5.s }[0], [x1], x2
+    ins         v28.b[7], v4.b[0]
+    ins         v28.b[15], v5.b[0]
+    ins         v29.b[7], v4.b[1]
+    ins         v29.b[15], v5.b[1]
+    ins         v30.b[7], v4.b[2]
+    ins         v30.b[15], v5.b[2]
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    movi        v18.8h, 0
+    movi        v19.8h, 0
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    movi        v23.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb2 v20, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    calc_epelb2 v21, v25, v26, v27, v28
+    calc_epelb  v18, v26, v27, v28, v29
+    calc_epelb2 v22, v26, v27, v28, v29
+    calc_epelb  v19, v27, v28, v29, v30
+    calc_epelb2 v23, v27, v28, v29, v30
+    st4         { v16.8h, v17.8h, v18.8h, v19.8h }, [x0], 64
+    st4         { v20.8h, v21.8h, v22.8h, v23.8h }, [x0], 64
+    subs        x3, x3, 1
+    b.ne        1b
+    ret
+endfunc
+
+
+
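+// Vertical 4-tap filters.  Arguments follow the put_hevc_epel prototype in
+// hevcdsp.h: x0 = int16_t *dst (row stride MAX_PB_SIZE = 64 elements, i.e.
+// 128 bytes), x1 = src, x2 = srcstride, x3 = height, x5 = my (filter index).
+// Three rows are pre-loaded, and the loop is unrolled four times so the row
+// registers rotate roles and only one new row is loaded per output row.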
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub         x1, x1, x2
+    mov         x10, 128
+    ld1         { v16.s }[0], [x1], x2
+    ld1         { v17.s }[0], [x1], x2
+    ld1         { v18.s }[0], [x1], x2
+1:  ld1         { v19.s }[0], [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    st1         { v4.4h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.s }[0], [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    st1         { v4.4h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.s }[0], [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    st1         { v4.4h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.s }[0], [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    st1         { v4.4h }, [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+
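+// Width 6: each output row is 6 int16_t values, written as one 8-byte piece
+// plus one 4-byte piece; the final 120-byte increment restores the 128-byte
+// row stride.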
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub         x1, x1, x2
+    mov         x10, 120
+    ld1         { v16.8b }, [x1], x2
+    ld1         { v17.8b }, [x1], x2
+    ld1         { v18.8b }, [x1], x2
+1:  ld1         { v19.8b }, [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    st1         { v4.d }[0], [x0], 8
+    st1         { v4.s }[2], [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    st1         { v4.d }[0], [x0], 8
+    st1         { v4.s }[2], [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    st1         { v4.d }[0], [x0], 8
+    st1         { v4.s }[2], [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    st1         { v4.d }[0], [x0], 8
+    st1         { v4.s }[2], [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub         x1, x1, x2
+    mov         x10, 128
+    ld1         { v16.8b }, [x1], x2
+    ld1         { v17.8b }, [x1], x2
+    ld1         { v18.8b }, [x1], x2
+1:  ld1         { v19.8b }, [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    st1         { v4.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    st1         { v4.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    st1         { v4.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x1], x2
+    movi        v4.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    st1         { v4.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub         x1, x1, x2
+    mov         x10, 112
+    ld1         { v16.16b }, [x1], x2
+    ld1         { v17.16b }, [x1], x2
+    ld1         { v18.16b }, [x1], x2
+1:  ld1         { v19.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    calc_epelb2 v5, v16, v17, v18, v19
+    st1         { v4.8h }, [x0], 16
+    st1         { v5.d }[0], [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    calc_epelb2 v5, v17, v18, v19, v16
+    st1         { v4.8h }, [x0], 16
+    st1         { v5.d }[0], [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    calc_epelb2 v5, v18, v19, v16, v17
+    st1         { v4.8h }, [x0], 16
+    st1         { v5.d }[0], [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    calc_epelb2 v5, v19, v16, v17, v18
+    st1         { v4.8h }, [x0], 16
+    st1         { v5.d }[0], [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub         x1, x1, x2
+    mov         x10, 128
+    ld1         { v16.16b }, [x1], x2
+    ld1         { v17.16b }, [x1], x2
+    ld1         { v18.16b }, [x1], x2
+1:  ld1         { v19.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    calc_epelb2 v5, v16, v17, v18, v19
+    st1         { v4.8h, v5.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    calc_epelb2 v5, v17, v18, v19, v16
+    st1         { v4.8h, v5.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    calc_epelb2 v5, v18, v19, v16, v17
+    st1         { v4.8h, v5.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    calc_epelb2 v5, v19, v16, v17, v18
+    st1         { v4.8h, v5.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub         x1, x1, x2
+    mov         x10, 128
+    ld1         { v16.8b, v17.8b, v18.8b }, [x1], x2
+    ld1         { v19.8b, v20.8b, v21.8b }, [x1], x2
+    ld1         { v22.8b, v23.8b, v24.8b }, [x1], x2
+1:  ld1         { v25.8b, v26.8b, v27.8b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v16, v19, v22, v25
+    calc_epelb  v5, v17, v20, v23, v26
+    calc_epelb  v6, v18, v21, v24, v27
+    st1         { v4.8h, v5.8h, v6.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8b, v17.8b, v18.8b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v19, v22, v25, v16
+    calc_epelb  v5, v20, v23, v26, v17
+    calc_epelb  v6, v21, v24, v27, v18
+    st1         { v4.8h, v5.8h, v6.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v19.8b, v20.8b, v21.8b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v22, v25, v16, v19
+    calc_epelb  v5, v23, v26, v17, v20
+    calc_epelb  v6, v24, v27, v18, v21
+    st1         { v4.8h, v5.8h, v6.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.8b, v23.8b, v24.8b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v25, v16, v19, v22
+    calc_epelb  v5, v26, v17, v20, v23
+    calc_epelb  v6, v27, v18, v21, v24
+    st1         { v4.8h, v5.8h, v6.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub         x1, x1, x2
+    mov         x10, 128
+    ld1         { v16.16b, v17.16b }, [x1], x2
+    ld1         { v18.16b, v19.16b }, [x1], x2
+    ld1         { v20.16b, v21.16b }, [x1], x2
+1:  ld1         { v22.16b, v23.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v16, v18, v20, v22
+    calc_epelb2 v5, v16, v18, v20, v22
+    calc_epelb  v6, v17, v19, v21, v23
+    calc_epelb2 v7, v17, v19, v21, v23
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.16b, v17.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v18, v20, v22, v16
+    calc_epelb2 v5, v18, v20, v22, v16
+    calc_epelb  v6, v19, v21, v23, v17
+    calc_epelb2 v7, v19, v21, v23, v17
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.16b, v19.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v20, v22, v16, v18
+    calc_epelb2 v5, v20, v22, v16, v18
+    calc_epelb  v6, v21, v23, v17, v19
+    calc_epelb2 v7, v21, v23, v17, v19
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.16b, v21.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v22, v16, v18, v20
+    calc_epelb2 v5, v22, v16, v18, v20
+    calc_epelb  v6, v23, v17, v19, v21
+    calc_epelb2 v7, v23, v17, v19, v21
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub         x1, x1, x2
+    mov         x10, 64
+    ld1         { v16.16b, v17.16b, v18.16b }, [x1], x2
+    ld1         { v19.16b, v20.16b, v21.16b }, [x1], x2
+    ld1         { v22.16b, v23.16b, v24.16b }, [x1], x2
+1:  ld1         { v25.16b, v26.16b, v27.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    calc_epelb  v4, v16, v19, v22, v25
+    calc_epelb2 v5, v16, v19, v22, v25
+    calc_epelb  v6, v17, v20, v23, v26
+    calc_epelb2 v7, v17, v20, v23, v26
+    calc_epelb  v28, v18, v21, v24, v27
+    calc_epelb2 v29, v18, v21, v24, v27
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], 64
+    st1         { v28.8h, v29.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.16b, v17.16b, v18.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    calc_epelb  v4, v19, v22, v25, v16
+    calc_epelb2 v5, v19, v22, v25, v16
+    calc_epelb  v6, v20, v23, v26, v17
+    calc_epelb2 v7, v20, v23, v26, v17
+    calc_epelb  v28, v21, v24, v27, v18
+    calc_epelb2 v29, v21, v24, v27, v18
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], 64
+    st1         { v28.8h, v29.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v19.16b, v20.16b, v21.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    calc_epelb  v4, v22, v25, v16, v19
+    calc_epelb2 v5, v22, v25, v16, v19
+    calc_epelb  v6, v23, v26, v17, v20
+    calc_epelb2 v7, v23, v26, v17, v20
+    calc_epelb  v28, v24, v27, v18, v21
+    calc_epelb2 v29, v24, v27, v18, v21
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], 64
+    st1         { v28.8h, v29.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.16b, v23.16b, v24.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    calc_epelb  v4, v25, v16, v19, v22
+    calc_epelb2 v5, v25, v16, v19, v22
+    calc_epelb  v6, v26, v17, v20, v23
+    calc_epelb2 v7, v26, v17, v20, v23
+    calc_epelb  v28, v27, v18, v21, v24
+    calc_epelb2 v29, v27, v18, v21, v24
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], 64
+    st1         { v28.8h, v29.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub         x1, x1, x2
+    ld1         { v16.16b, v17.16b, v18.16b, v19.16b }, [x1], x2
+    ld1         { v20.16b, v21.16b, v22.16b, v23.16b }, [x1], x2
+    ld1         { v24.16b, v25.16b, v26.16b, v27.16b }, [x1], x2
+1:  ld1         { v28.16b, v29.16b, v30.16b, v31.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_epelb  v4, v16, v20, v24, v28
+    calc_epelb2 v5, v16, v20, v24, v28
+    calc_epelb  v6, v17, v21, v25, v29
+    calc_epelb2 v7, v17, v21, v25, v29
+    calc_epelb  v8, v18, v22, v26, v30
+    calc_epelb2 v9, v18, v22, v26, v30
+    calc_epelb  v10, v19, v23, v27, v31
+    calc_epelb2 v11, v19, v23, v27, v31
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], 64
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], 64
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.16b, v17.16b, v18.16b, v19.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_epelb  v4, v20, v24, v28, v16
+    calc_epelb2 v5, v20, v24, v28, v16
+    calc_epelb  v6, v21, v25, v29, v17
+    calc_epelb2 v7, v21, v25, v29, v17
+    calc_epelb  v8, v22, v26, v30, v18
+    calc_epelb2 v9, v22, v26, v30, v18
+    calc_epelb  v10, v23, v27, v31, v19
+    calc_epelb2 v11, v23, v27, v31, v19
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], 64
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], 64
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.16b, v21.16b, v22.16b, v23.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_epelb  v4, v24, v28, v16, v20
+    calc_epelb2 v5, v24, v28, v16, v20
+    calc_epelb  v6, v25, v29, v17, v21
+    calc_epelb2 v7, v25, v29, v17, v21
+    calc_epelb  v8, v26, v30, v18, v22
+    calc_epelb2 v9, v26, v30, v18, v22
+    calc_epelb  v10, v27, v31, v19, v23
+    calc_epelb2 v11, v27, v31, v19, v23
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], 64
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], 64
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v24.16b, v25.16b, v26.16b, v27.16b }, [x1], x2
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_epelb  v4, v28, v16, v20, v24
+    calc_epelb2 v5, v28, v16, v20, v24
+    calc_epelb  v6, v29, v17, v21, v25
+    calc_epelb2 v7, v29, v17, v21, v25
+    calc_epelb  v8, v30, v18, v22, v26
+    calc_epelb2 v9, v30, v18, v22, v26
+    calc_epelb  v10, v31, v19, v23, v27
+    calc_epelb2 v11, v31, v19, v23, v27
+    st1         { v4.8h, v5.8h, v6.8h, v7.8h }, [x0], 64
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], 64
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+
+
+
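+// Combined horizontal+vertical filters.  A (height + 3)-row horizontal pass is
+// run first into a 128-byte-stride temporary allocated on the stack (x0/x3 and
+// x5/x30 are saved around the call), then the vertical 4-tap filter is applied
+// to the 16-bit intermediate with calc_epelh, using the same 4x-unrolled
+// register rotation as the pure vertical functions.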
+function ff_hevc_put_hevc_epel_hv4_8_neon, export=1
+    add         x10, x3, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2
+    add         x3, x3, 3
+    bl          ff_hevc_put_hevc_epel_h4_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_epel_filterh x5, x4
+    mov         x10, 128
+    ld1         { v16.4h }, [sp], x10
+    ld1         { v17.4h }, [sp], x10
+    ld1         { v18.4h }, [sp], x10
+1:  ld1         { v19.4h }, [sp], x10
+    calc_epelh  v4, v16, v17, v18, v19
+    st1         { v4.4h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.4h }, [sp], x10
+    calc_epelh  v4, v17, v18, v19, v16
+    st1         { v4.4h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.4h }, [sp], x10
+    calc_epelh  v4, v18, v19, v16, v17
+    st1         { v4.4h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.4h }, [sp], x10
+    calc_epelh  v4, v19, v16, v17, v18
+    st1         { v4.4h }, [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv6_8_neon, export=1
+    add         x10, x3, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2
+    add         x3, x3, 3
+    bl          ff_hevc_put_hevc_epel_h6_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_epel_filterh x5, x4
+    mov         x5, 120
+    mov         x10, 128
+    ld1         { v16.8h }, [sp], x10
+    ld1         { v17.8h }, [sp], x10
+    ld1         { v18.8h }, [sp], x10
+1:  ld1         { v19.8h }, [sp], x10
+    calc_epelh  v4, v16, v17, v18, v19
+    calc_epelh2 v4, v5, v16, v17, v18, v19
+    st1         { v4.d }[0], [x0], 8
+    st1         { v4.s }[2], [x0], x5
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x10
+    calc_epelh  v4, v17, v18, v19, v16
+    calc_epelh2 v4, v5, v17, v18, v19, v16
+    st1         { v4.d }[0], [x0], 8
+    st1         { v4.s }[2], [x0], x5
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v19, v16, v17
+    calc_epelh2 v4, v5, v18, v19, v16, v17
+    st1         { v4.d }[0], [x0], 8
+    st1         { v4.s }[2], [x0], x5
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x10
+    calc_epelh  v4, v19, v16, v17, v18
+    calc_epelh2 v4, v5, v19, v16, v17, v18
+    st1         { v4.d }[0], [x0], 8
+    st1         { v4.s }[2], [x0], x5
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv8_8_neon, export=1
+    add         x10, x3, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2
+    add         x3, x3, 3
+    bl          ff_hevc_put_hevc_epel_h8_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_epel_filterh x5, x4
+    mov         x10, 128
+    ld1         { v16.8h }, [sp], x10
+    ld1         { v17.8h }, [sp], x10
+    ld1         { v18.8h }, [sp], x10
+1:  ld1         { v19.8h }, [sp], x10
+    calc_epelh  v4, v16, v17, v18, v19
+    calc_epelh2 v4, v5, v16, v17, v18, v19
+    st1         { v4.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x10
+    calc_epelh  v4, v17, v18, v19, v16
+    calc_epelh2 v4, v5, v17, v18, v19, v16
+    st1         { v4.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v19, v16, v17
+    calc_epelh2 v4, v5, v18, v19, v16, v17
+    st1         { v4.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x10
+    calc_epelh  v4, v19, v16, v17, v18
+    calc_epelh2 v4, v5, v19, v16, v17, v18
+    st1         { v4.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv12_8_neon, export=1
+    add         x10, x3, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2
+    add         x3, x3, 3
+    bl          ff_hevc_put_hevc_epel_h12_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_epel_filterh x5, x4
+    mov         x5, 112
+    mov         x10, 128
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    ld1         { v20.8h, v21.8h }, [sp], x10
+1:  ld1         { v22.8h, v23.8h }, [sp], x10
+    calc_epelh  v4, v16, v18, v20, v22
+    calc_epelh2 v4, v5, v16, v18, v20, v22
+    calc_epelh  v5, v17, v19, v21, v23
+    st1         { v4.8h }, [x0], 16
+    st1         { v5.4h }, [x0], x5
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v20, v22, v16
+    calc_epelh2 v4, v5, v18, v20, v22, v16
+    calc_epelh  v5, v19, v21, v23, v17
+    st1         { v4.8h }, [x0], 16
+    st1         { v5.4h }, [x0], x5
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    calc_epelh  v4, v20, v22, v16, v18
+    calc_epelh2 v4, v5, v20, v22, v16, v18
+    calc_epelh  v5, v21, v23, v17, v19
+    st1         { v4.8h }, [x0], 16
+    st1         { v5.4h }, [x0], x5
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.8h, v21.8h }, [sp], x10
+    calc_epelh  v4, v22, v16, v18, v20
+    calc_epelh2 v4, v5, v22, v16, v18, v20
+    calc_epelh  v5, v23, v17, v19, v21
+    st1         { v4.8h }, [x0], 16
+    st1         { v5.4h }, [x0], x5
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv16_8_neon, export=1
+    add         x10, x3, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2
+    add         x3, x3, 3
+    bl          ff_hevc_put_hevc_epel_h16_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_epel_filterh x5, x4
+    mov         x10, 128
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    ld1         { v20.8h, v21.8h }, [sp], x10
+1:  ld1         { v22.8h, v23.8h }, [sp], x10
+    calc_epelh  v4, v16, v18, v20, v22
+    calc_epelh2 v4, v5, v16, v18, v20, v22
+    calc_epelh  v5, v17, v19, v21, v23
+    calc_epelh2 v5, v6, v17, v19, v21, v23
+    st1         { v4.8h, v5.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v20, v22, v16
+    calc_epelh2 v4, v5, v18, v20, v22, v16
+    calc_epelh  v5, v19, v21, v23, v17
+    calc_epelh2 v5, v6, v19, v21, v23, v17
+    st1         { v4.8h, v5.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    calc_epelh  v4, v20, v22, v16, v18
+    calc_epelh2 v4, v5, v20, v22, v16, v18
+    calc_epelh  v5, v21, v23, v17, v19
+    calc_epelh2 v5, v6, v21, v23, v17, v19
+    st1         { v4.8h, v5.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.8h, v21.8h }, [sp], x10
+    calc_epelh  v4, v22, v16, v18, v20
+    calc_epelh2 v4, v5, v22, v16, v18, v20
+    calc_epelh  v5, v23, v17, v19, v21
+    calc_epelh2 v5, v6, v23, v17, v19, v21
+    st1         { v4.8h, v5.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv24_8_neon, export=1
+    add         x10, x3, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2
+    add         x3, x3, 3
+    bl          ff_hevc_put_hevc_epel_h24_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_epel_filterh x5, x4
+    mov         x10, 128
+    ld1         { v16.8h, v17.8h, v18.8h }, [sp], x10
+    ld1         { v19.8h, v20.8h, v21.8h }, [sp], x10
+    ld1         { v22.8h, v23.8h, v24.8h }, [sp], x10
+1:  ld1         { v25.8h, v26.8h, v27.8h }, [sp], x10
+    calc_epelh  v4, v16, v19, v22, v25
+    calc_epelh2 v4, v5, v16, v19, v22, v25
+    calc_epelh  v5, v17, v20, v23, v26
+    calc_epelh2 v5, v6, v17, v20, v23, v26
+    calc_epelh  v6, v18, v21, v24, v27
+    calc_epelh2 v6, v7, v18, v21, v24, v27
+    st1         { v4.8h, v5.8h, v6.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h, v18.8h }, [sp], x10
+    calc_epelh  v4, v19, v22, v25, v16
+    calc_epelh2 v4, v5, v19, v22, v25, v16
+    calc_epelh  v5, v20, v23, v26, v17
+    calc_epelh2 v5, v6, v20, v23, v26, v17
+    calc_epelh  v6, v21, v24, v27, v18
+    calc_epelh2 v6, v7, v21, v24, v27, v18
+    st1         { v4.8h, v5.8h, v6.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v19.8h, v20.8h, v21.8h }, [sp], x10
+    calc_epelh  v4, v22, v25, v16, v19
+    calc_epelh2 v4, v5, v22, v25, v16, v19
+    calc_epelh  v5, v23, v26, v17, v20
+    calc_epelh2 v5, v6, v23, v26, v17, v20
+    calc_epelh  v6, v24, v27, v18, v21
+    calc_epelh2 v6, v7, v24, v27, v18, v21
+    st1         { v4.8h, v5.8h, v6.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.8h, v23.8h, v24.8h }, [sp], x10
+    calc_epelh  v4, v25, v16, v19, v22
+    calc_epelh2 v4, v5, v25, v16, v19, v22
+    calc_epelh  v5, v26, v17, v20, v23
+    calc_epelh2 v5, v6, v26, v17, v20, v23
+    calc_epelh  v6, v27, v18, v21, v24
+    calc_epelh2 v6, v7, v27, v18, v21, v24
+    st1         { v4.8h, v5.8h, v6.8h }, [x0], x10
+    subs        x3, x3, 1
+    b.ne        1b
+2:  ret
+endfunc
+
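+// The 32/48/64-wide hv variants run the 16- or 24-wide hv kernel on successive
+// horizontal slices: the saved arguments are reloaded, dst is advanced by 16
+// (or 24) int16_t elements and src by the same number of pixels, and the slice
+// width is passed in x6.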
+function ff_hevc_put_hevc_epel_hv32_8_neon, export=1
+    stp         xzr, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    mov         x6, 16
+    bl          ff_hevc_put_hevc_epel_hv16_8_neon
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    add         x0, x0, 32
+    add         x1, x1, 16
+    mov         x6, 16
+    bl          ff_hevc_put_hevc_epel_hv16_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv48_8_neon, export=1
+    stp         xzr, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    mov         x6, 24
+    bl          ff_hevc_put_hevc_epel_hv24_8_neon
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    add         x0, x0, 48
+    add         x1, x1, 24
+    mov         x6, 24
+    bl          ff_hevc_put_hevc_epel_hv24_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv64_8_neon, export=1
+    stp         xzr, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    mov         x6, 16
+    bl          ff_hevc_put_hevc_epel_hv16_8_neon
+    ldp         x4, x5, [sp]
+    ldp         x2, x3, [sp, 16]
+    ldp         x0, x1, [sp, 32]
+    add         x0, x0, 32
+    add         x1, x1, 16
+    mov         x6, 16
+    bl          ff_hevc_put_hevc_epel_hv16_8_neon
+    ldp         x4, x5, [sp]
+    ldp         x2, x3, [sp, 16]
+    ldp         x0, x1, [sp, 32]
+    add         x0, x0, 64
+    add         x1, x1, 32
+    mov         x6, 16
+    bl          ff_hevc_put_hevc_epel_hv16_8_neon
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    add         x0, x0, 96
+    add         x1, x1, 48
+    mov         x6, 16
+    bl          ff_hevc_put_hevc_epel_hv16_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
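+// Unidirectional vertical filters.  Arguments follow the put_hevc_epel_uni
+// prototype: x0 = uint8_t *dst, x1 = dststride, x2 = src, x3 = srcstride,
+// x4 = height, x6 = my.  The structure matches the plain vertical functions,
+// but the 16-bit accumulators are rounded and narrowed back to 8-bit pixels
+// with sqrshrun #6 before being stored to dst.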
+function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
+    load_epel_filterb x6, x5
+    sub         x2, x2, x3
+    ld1         { v16.s }[0], [x2], x3
+    ld1         { v17.s }[0], [x2], x3
+    ld1         { v18.s }[0], [x2], x3
+1:  ld1         { v19.s }[0], [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.s }[0], [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.s }[0], [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.s }[0], [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v6_8_neon, export=1
+    load_epel_filterb x6, x5
+    sub         x2, x2, x3
+    sub         x1, x1, 4
+    ld1         { v16.8b }, [x2], x3
+    ld1         { v17.8b }, [x2], x3
+    ld1         { v18.8b }, [x2], x3
+1:  ld1         { v19.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v8_8_neon, export=1
+    load_epel_filterb x6, x5
+    sub         x2, x2, x3
+    ld1         { v16.8b }, [x2], x3
+    ld1         { v17.8b }, [x2], x3
+    ld1         { v18.8b }, [x2], x3
+1:  ld1         { v19.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    sqrshrun    v4.8b,  v4.8h, 6
+    st1         { v4.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v12_8_neon, export=1
+    load_epel_filterb x6, x5
+    sub         x2, x2, x3
+    sub         x1, x1, 8
+    ld1         { v16.16b }, [x2], x3
+    ld1         { v17.16b }, [x2], x3
+    ld1         { v18.16b }, [x2], x3
+1:  ld1         { v19.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    calc_epelb2 v5, v16, v17, v18, v19
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    calc_epelb2 v5, v17, v18, v19, v16
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    calc_epelb2 v5, v18, v19, v16, v17
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    calc_epelb2 v5, v19, v16, v17, v18
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v16_8_neon, export=1
+    load_epel_filterb x6, x5
+    sub         x2, x2, x3
+    ld1         { v16.16b }, [x2], x3
+    ld1         { v17.16b }, [x2], x3
+    ld1         { v18.16b }, [x2], x3
+1:  ld1         { v19.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    calc_epelb2 v5, v16, v17, v18, v19
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    st1         { v4.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    calc_epelb2 v5, v17, v18, v19, v16
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    st1         { v4.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    calc_epelb2 v5, v18, v19, v16, v17
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    st1         { v4.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    calc_epelb2 v5, v19, v16, v17, v18
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    st1         { v4.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v24_8_neon, export=1
+    load_epel_filterb x6, x5
+    sub         x2, x2, x3
+    ld1         { v16.8b, v17.8b, v18.8b }, [x2], x3
+    ld1         { v19.8b, v20.8b, v21.8b }, [x2], x3
+    ld1         { v22.8b, v23.8b, v24.8b }, [x2], x3
+1:  ld1         { v25.8b, v26.8b, v27.8b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v16, v19, v22, v25
+    calc_epelb  v5, v17, v20, v23, v26
+    calc_epelb  v6, v18, v21, v24, v27
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun    v5.8b,  v5.8h, 6
+    sqrshrun    v6.8b,  v6.8h, 6
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8b, v17.8b, v18.8b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v19, v22, v25, v16
+    calc_epelb  v5, v20, v23, v26, v17
+    calc_epelb  v6, v21, v24, v27, v18
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun    v5.8b,  v5.8h, 6
+    sqrshrun    v6.8b,  v6.8h, 6
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v19.8b, v20.8b, v21.8b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v22, v25, v16, v19
+    calc_epelb  v5, v23, v26, v17, v20
+    calc_epelb  v6, v24, v27, v18, v21
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun    v5.8b,  v5.8h, 6
+    sqrshrun    v6.8b,  v6.8h, 6
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v22.8b, v23.8b, v24.8b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v25, v16, v19, v22
+    calc_epelb  v5, v26, v17, v20, v23
+    calc_epelb  v6, v27, v18, v21, v24
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun    v5.8b,  v5.8h, 6
+    sqrshrun    v6.8b,  v6.8h, 6
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v32_8_neon, export=1
+    load_epel_filterb x6, x5
+    sub         x2, x2, x3
+    ld1         { v16.16b, v17.16b }, [x2], x3
+    ld1         { v18.16b, v19.16b }, [x2], x3
+    ld1         { v20.16b, v21.16b }, [x2], x3
+1:  ld1         { v22.16b, v23.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v16, v18, v20, v22
+    calc_epelb2 v5, v16, v18, v20, v22
+    calc_epelb  v6, v17, v19, v21, v23
+    calc_epelb2 v7, v17, v19, v21, v23
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    st1         { v4.16b, v5.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.16b, v17.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v18, v20, v22, v16
+    calc_epelb2 v5, v18, v20, v22, v16
+    calc_epelb  v6, v19, v21, v23, v17
+    calc_epelb2 v7, v19, v21, v23, v17
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    st1         { v4.16b, v5.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.16b, v19.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v20, v22, v16, v18
+    calc_epelb2 v5, v20, v22, v16, v18
+    calc_epelb  v6, v21, v23, v17, v19
+    calc_epelb2 v7, v21, v23, v17, v19
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    st1         { v4.16b, v5.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v20.16b, v21.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v22, v16, v18, v20
+    calc_epelb2 v5, v22, v16, v18, v20
+    calc_epelb  v6, v23, v17, v19, v21
+    calc_epelb2 v7, v23, v17, v19, v21
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    st1         { v4.16b, v5.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+
+function ff_hevc_put_hevc_epel_uni_v48_8_neon, export=1
+    load_epel_filterb x6, x5
+    sub         x2, x2, x3
+    ld1         { v16.16b, v17.16b, v18.16b }, [x2], x3
+    ld1         { v19.16b, v20.16b, v21.16b }, [x2], x3
+    ld1         { v22.16b, v23.16b, v24.16b }, [x2], x3
+1:  ld1         { v25.16b, v26.16b, v27.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    calc_epelb  v4, v16, v19, v22, v25
+    calc_epelb2 v5, v16, v19, v22, v25
+    calc_epelb  v6, v17, v20, v23, v26
+    calc_epelb2 v7, v17, v20, v23, v26
+    calc_epelb  v28, v18, v21, v24, v27
+    calc_epelb2 v29, v18, v21, v24, v27
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    sqrshrun    v6.8b,  v28.8h, 6
+    sqrshrun2   v6.16b,  v29.8h, 6
+    st1         { v4.16b, v5.16b, v6.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.16b, v17.16b, v18.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    calc_epelb  v28, v21, v24, v27, v18
+    calc_epelb2 v29, v21, v24, v27, v18
+    calc_epelb  v4, v19, v22, v25, v16
+    calc_epelb2 v5, v19, v22, v25, v16
+    calc_epelb  v6, v20, v23, v26, v17
+    calc_epelb2 v7, v20, v23, v26, v17
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    sqrshrun    v6.8b,  v28.8h, 6
+    sqrshrun2   v6.16b,  v29.8h, 6
+    st1         { v4.16b, v5.16b, v6.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v19.16b, v20.16b, v21.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    calc_epelb  v28, v24, v27, v18, v21
+    calc_epelb2 v29, v24, v27, v18, v21
+    calc_epelb  v4, v22, v25, v16, v19
+    calc_epelb2 v5, v22, v25, v16, v19
+    calc_epelb  v6, v23, v26, v17, v20
+    calc_epelb2 v7, v23, v26, v17, v20
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    sqrshrun    v6.8b,  v28.8h, 6
+    sqrshrun2   v6.16b,  v29.8h, 6
+    st1         { v4.16b, v5.16b, v6.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v22.16b, v23.16b, v24.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    calc_epelb  v28, v27, v18, v21, v24
+    calc_epelb2 v29, v27, v18, v21, v24
+    calc_epelb  v4, v25, v16, v19, v22
+    calc_epelb2 v5, v25, v16, v19, v22
+    calc_epelb  v6, v26, v17, v20, v23
+    calc_epelb2 v7, v26, v17, v20, v23
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    sqrshrun    v6.8b,  v28.8h, 6
+    sqrshrun2   v6.16b,  v29.8h, 6
+    st1         { v4.16b, v5.16b, v6.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v64_8_neon, export=1
+    load_epel_filterb x6, x5
+    sub         x2, x2, x3
+    ld1         { v16.16b, v17.16b, v18.16b, v19.16b }, [x2], x3
+    ld1         { v20.16b, v21.16b, v22.16b, v23.16b }, [x2], x3
+    ld1         { v24.16b, v25.16b, v26.16b, v27.16b }, [x2], x3
+1:  ld1         { v28.16b, v29.16b, v30.16b, v31.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_epelb  v10, v19, v23, v27, v31
+    calc_epelb2 v11, v19, v23, v27, v31
+    calc_epelb  v4, v16, v20, v24, v28
+    calc_epelb2 v5, v16, v20, v24, v28
+    calc_epelb  v6, v17, v21, v25, v29
+    calc_epelb2 v7, v17, v21, v25, v29
+    calc_epelb  v8, v18, v22, v26, v30
+    calc_epelb2 v9, v18, v22, v26, v30
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    sqrshrun    v6.8b,  v8.8h, 6
+    sqrshrun2   v6.16b,  v9.8h, 6
+    sqrshrun    v7.8b,  v10.8h, 6
+    sqrshrun2   v7.16b,  v11.8h, 6
+    st1         { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.16b, v17.16b, v18.16b, v19.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_epelb  v10, v23, v27, v31, v19
+    calc_epelb2 v11, v23, v27, v31, v19
+    calc_epelb  v4, v20, v24, v28, v16
+    calc_epelb2 v5, v20, v24, v28, v16
+    calc_epelb  v6, v21, v25, v29, v17
+    calc_epelb2 v7, v21, v25, v29, v17
+    calc_epelb  v8, v22, v26, v30, v18
+    calc_epelb2 v9, v22, v26, v30, v18
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    sqrshrun    v6.8b,  v8.8h, 6
+    sqrshrun2   v6.16b,  v9.8h, 6
+    sqrshrun    v7.8b,  v10.8h, 6
+    sqrshrun2   v7.16b,  v11.8h, 6
+    st1         { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v20.16b, v21.16b, v22.16b, v23.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_epelb  v10, v27, v31, v19, v23
+    calc_epelb2 v11, v27, v31, v19, v23
+    calc_epelb  v4, v24, v28, v16, v20
+    calc_epelb2 v5, v24, v28, v16, v20
+    calc_epelb  v6, v25, v29, v17, v21
+    calc_epelb2 v7, v25, v29, v17, v21
+    calc_epelb  v8, v26, v30, v18, v22
+    calc_epelb2 v9, v26, v30, v18, v22
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    sqrshrun    v6.8b,  v8.8h, 6
+    sqrshrun2   v6.16b,  v9.8h, 6
+    sqrshrun    v7.8b,  v10.8h, 6
+    sqrshrun2   v7.16b,  v11.8h, 6
+    st1         { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v24.16b, v25.16b, v26.16b, v27.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_epelb  v10, v31, v19, v23, v27
+    calc_epelb2 v11, v31, v19, v23, v27
+    calc_epelb  v4, v28, v16, v20, v24
+    calc_epelb2 v5, v28, v16, v20, v24
+    calc_epelb  v6, v29, v17, v21, v25
+    calc_epelb2 v7, v29, v17, v21, v25
+    calc_epelb  v8, v30, v18, v22, v26
+    calc_epelb2 v9, v30, v18, v22, v26
+    sqrshrun    v4.8b,  v4.8h, 6
+    sqrshrun2   v4.16b,  v5.8h, 6
+    sqrshrun    v5.8b,  v6.8h, 6
+    sqrshrun2   v5.16b,  v7.8h, 6
+    sqrshrun    v6.8b,  v8.8h, 6
+    sqrshrun2   v6.16b,  v9.8h, 6
+    sqrshrun    v7.8b,  v10.8h, 6
+    sqrshrun2   v7.16b,  v11.8h, 6
+    st1         { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+
+
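+// Unidirectional hv filters: the same two-pass scheme as the hv functions
+// above (horizontal pass over height + 3 rows into a stack temporary, then the
+// vertical calc_epelh pass), with the result rounded and narrowed to 8-bit via
+// sqrshrun #6 and stored with dststride x1.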
+function ff_hevc_put_hevc_epel_uni_hv4_8_neon, export=1
+    add         x10, x4, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         xzr, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x4, 3
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_epel_h4_8_neon
+    ldp         xzr, x30, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x6, x5
+    mov         x10, 128
+    ld1         { v16.4h }, [sp], x10
+    ld1         { v17.4h }, [sp], x10
+    ld1         { v18.4h }, [sp], x10
+1:  ld1         { v19.4h }, [sp], x10
+    calc_epelh  v4, v16, v17, v18, v19
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.4h }, [sp], x10
+    calc_epelh  v4, v17, v18, v19, v16
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.4h }, [sp], x10
+    calc_epelh  v4, v18, v19, v16, v17
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.4h }, [sp], x10
+    calc_epelh  v4, v19, v16, v17, v18
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv6_8_neon, export=1
+    add         x10, x4, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         xzr, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x4, 3
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_epel_h6_8_neon
+    ldp         xzr, x30, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x6, x5
+    sub         x1, x1, 4
+    mov         x10, 128
+    ld1         { v16.8h }, [sp], x10
+    ld1         { v17.8h }, [sp], x10
+    ld1         { v18.8h }, [sp], x10
+1:  ld1         { v19.8h }, [sp], x10
+    calc_epelh  v4, v16, v17, v18, v19
+    calc_epelh2 v4, v5, v16, v17, v18, v19
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x10
+    calc_epelh  v4, v17, v18, v19, v16
+    calc_epelh2 v4, v5, v17, v18, v19, v16
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v19, v16, v17
+    calc_epelh2 v4, v5, v18, v19, v16, v17
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x10
+    calc_epelh  v4, v19, v16, v17, v18
+    calc_epelh2 v4, v5, v19, v16, v17, v18
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv8_8_neon, export=1
+    add         x10, x4, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         xzr, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x4, 3
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_epel_h8_8_neon
+    ldp         xzr, x30, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x6, x5
+    mov         x10, 128
+    ld1         { v16.8h }, [sp], x10
+    ld1         { v17.8h }, [sp], x10
+    ld1         { v18.8h }, [sp], x10
+1:  ld1         { v19.8h }, [sp], x10
+    calc_epelh  v4, v16, v17, v18, v19
+    calc_epelh2 v4, v5, v16, v17, v18, v19
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x10
+    calc_epelh  v4, v17, v18, v19, v16
+    calc_epelh2 v4, v5, v17, v18, v19, v16
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v19, v16, v17
+    calc_epelh2 v4, v5, v18, v19, v16, v17
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x10
+    calc_epelh  v4, v19, v16, v17, v18
+    calc_epelh2 v4, v5, v19, v16, v17, v18
+    sqrshrun    v4.8b, v4.8h, 6
+    st1         { v4.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv12_8_neon, export=1
+    add         x10, x4, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         xzr, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x4, 3
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_epel_h12_8_neon
+    ldp         xzr, x30, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x6, x5
+    sub         x1, x1, 8
+    mov         x10, 128
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    ld1         { v20.8h, v21.8h }, [sp], x10
+1:  ld1         { v22.8h, v23.8h }, [sp], x10
+    calc_epelh  v4, v16, v18, v20, v22
+    calc_epelh2 v4, v5, v16, v18, v20, v22
+    calc_epelh  v5, v17, v19, v21, v23
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun2   v4.16b, v5.8h, 6
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v20, v22, v16
+    calc_epelh2 v4, v5, v18, v20, v22, v16
+    calc_epelh  v5, v19, v21, v23, v17
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun2   v4.16b, v5.8h, 6
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    calc_epelh  v4, v20, v22, v16, v18
+    calc_epelh2 v4, v5, v20, v22, v16, v18
+    calc_epelh  v5, v21, v23, v17, v19
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun2   v4.16b, v5.8h, 6
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v20.8h, v21.8h }, [sp], x10
+    calc_epelh  v4, v22, v16, v18, v20
+    calc_epelh2 v4, v5, v22, v16, v18, v20
+    calc_epelh  v5, v23, v17, v19, v21
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun2   v4.16b, v5.8h, 6
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv16_8_neon, export=1
+    add         x10, x4, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         xzr, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x4, 3
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_epel_h16_8_neon
+    ldp         xzr, x30, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x6, x5
+    mov         x10, 128
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    ld1         { v20.8h, v21.8h }, [sp], x10
+1:  ld1         { v22.8h, v23.8h }, [sp], x10
+    calc_epelh  v4, v16, v18, v20, v22
+    calc_epelh2 v4, v5, v16, v18, v20, v22
+    calc_epelh  v5, v17, v19, v21, v23
+    calc_epelh2 v5, v6, v17, v19, v21, v23
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun2   v4.16b, v5.8h, 6
+    st1         { v4.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v20, v22, v16
+    calc_epelh2 v4, v5, v18, v20, v22, v16
+    calc_epelh  v5, v19, v21, v23, v17
+    calc_epelh2 v5, v6, v19, v21, v23, v17
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun2   v4.16b, v5.8h, 6
+    st1         { v4.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    calc_epelh  v4, v20, v22, v16, v18
+    calc_epelh2 v4, v5, v20, v22, v16, v18
+    calc_epelh  v5, v21, v23, v17, v19
+    calc_epelh2 v5, v6, v21, v23, v17, v19
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun2   v4.16b, v5.8h, 6
+    st1         { v4.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v20.8h, v21.8h }, [sp], x10
+    calc_epelh  v4, v22, v16, v18, v20
+    calc_epelh2 v4, v5, v22, v16, v18, v20
+    calc_epelh  v5, v23, v17, v19, v21
+    calc_epelh2 v5, v6, v23, v17, v19, v21
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun2   v4.16b, v5.8h, 6
+    st1         { v4.16b }, [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv24_8_neon, export=1
+    add         x10, x4, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         xzr, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x4, 3
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_epel_h24_8_neon
+    ldp         xzr, x30, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x6, x5
+    mov         x10, 128
+    ld1         { v16.8h, v17.8h, v18.8h }, [sp], x10
+    ld1         { v19.8h, v20.8h, v21.8h }, [sp], x10
+    ld1         { v22.8h, v23.8h, v24.8h }, [sp], x10
+1:  ld1         { v25.8h, v26.8h, v27.8h }, [sp], x10
+    calc_epelh  v4, v16, v19, v22, v25
+    calc_epelh2 v4, v5, v16, v19, v22, v25
+    calc_epelh  v5, v17, v20, v23, v26
+    calc_epelh2 v5, v6, v17, v20, v23, v26
+    calc_epelh  v6, v18, v21, v24, v27
+    calc_epelh2 v6, v7, v18, v21, v24, v27
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun    v5.8b, v5.8h, 6
+    sqrshrun    v6.8b, v6.8h, 6
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h, v18.8h }, [sp], x10
+    calc_epelh  v4, v19, v22, v25, v16
+    calc_epelh2 v4, v5, v19, v22, v25, v16
+    calc_epelh  v5, v20, v23, v26, v17
+    calc_epelh2 v5, v6, v20, v23, v26, v17
+    calc_epelh  v6, v21, v24, v27, v18
+    calc_epelh2 v6, v7, v21, v24, v27, v18
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun    v5.8b, v5.8h, 6
+    sqrshrun    v6.8b, v6.8h, 6
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v19.8h, v20.8h, v21.8h }, [sp], x10
+    calc_epelh  v4, v22, v25, v16, v19
+    calc_epelh2 v4, v5, v22, v25, v16, v19
+    calc_epelh  v5, v23, v26, v17, v20
+    calc_epelh2 v5, v6, v23, v26, v17, v20
+    calc_epelh  v6, v24, v27, v18, v21
+    calc_epelh2 v6, v7, v24, v27, v18, v21
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun    v5.8b, v5.8h, 6
+    sqrshrun    v6.8b, v6.8h, 6
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v22.8h, v23.8h, v24.8h }, [sp], x10
+    calc_epelh  v4, v25, v16, v19, v22
+    calc_epelh2 v4, v5, v25, v16, v19, v22
+    calc_epelh  v5, v26, v17, v20, v23
+    calc_epelh2 v5, v6, v26, v17, v20, v23
+    calc_epelh  v6, v27, v18, v21, v24
+    calc_epelh2 v6, v7, v27, v18, v21, v24
+    sqrshrun    v4.8b, v4.8h, 6
+    sqrshrun    v5.8b, v5.8h, 6
+    sqrshrun    v6.8b, v6.8h, 6
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
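+// The 32/48/64-wide uni_hv variants below do not filter directly: they
+// save the arguments, run the 16- or 24-wide kernel once per column
+// strip (strip width passed in x7), then restore the arguments and
+// repeat with dst (x0) and src (x2) advanced by the strip width.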
+function ff_hevc_put_hevc_epel_uni_hv32_8_neon, export=1
+    stp         x0, x30, [sp, -16]!
+    stp         x1, x2, [sp, -16]!
+    stp         x3, x4, [sp, -16]!
+    stp         x5, x6, [sp, -16]!
+    mov         x7, 16
+    bl          ff_hevc_put_hevc_epel_uni_hv16_8_neon
+    ldp         x5, x6, [sp], 16
+    ldp         x3, x4, [sp], 16
+    ldp         x1, x2, [sp], 16
+    ldr         x0, [sp]
+    add         x0, x0, 16
+    add         x2, x2, 16
+    mov         x7, 16
+    bl          ff_hevc_put_hevc_epel_uni_hv16_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv48_8_neon, export=1
+    stp         x0, x30, [sp, -16]!
+    stp         x1, x2, [sp, -16]!
+    stp         x3, x4, [sp, -16]!
+    stp         x5, x6, [sp, -16]!
+    mov         x7, 24
+    bl          ff_hevc_put_hevc_epel_uni_hv24_8_neon
+    ldp         x5, x6, [sp], 16
+    ldp         x3, x4, [sp], 16
+    ldp         x1, x2, [sp], 16
+    ldr         x0, [sp]
+    add         x0, x0, 24
+    add         x2, x2, 24
+    mov         x7, 24
+    bl          ff_hevc_put_hevc_epel_uni_hv24_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv64_8_neon, export=1
+    stp         x0, x30, [sp, -16]!
+    stp         x1, x2, [sp, -16]!
+    stp         x3, x4, [sp, -16]!
+    stp         x5, x6, [sp, -16]!
+    mov         x7, 16
+    bl          ff_hevc_put_hevc_epel_uni_hv16_8_neon
+    ldp         x5, x6, [sp]
+    ldp         x3, x4, [sp, 16]
+    ldp         x1, x2, [sp, 32]
+    ldr         x0, [sp, 48]
+    add         x0, x0, 16
+    add         x2, x2, 16
+    mov         x7, 16
+    bl          ff_hevc_put_hevc_epel_uni_hv16_8_neon
+    ldp         x5, x6, [sp]
+    ldp         x3, x4, [sp, 16]
+    ldp         x1, x2, [sp, 32]
+    ldr         x0, [sp, 48]
+    add         x0, x0, 32
+    add         x2, x2, 32
+    mov         x7, 16
+    bl          ff_hevc_put_hevc_epel_uni_hv16_8_neon
+    ldp         x5, x6, [sp], 16
+    ldp         x3, x4, [sp], 16
+    ldp         x1, x2, [sp], 16
+    ldr         x0, [sp]
+    add         x0, x0, 48
+    add         x2, x2, 48
+    mov         x7, 16
+    bl          ff_hevc_put_hevc_epel_uni_hv16_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
+
+
+
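+// put_hevc_epel_bi_h* : horizontal 4-tap filter combined with the
+// bi-prediction intermediate plane.  Assuming the usual put_hevc_epel_bi
+// argument order, x0/x1 are dst/dststride, x2/x3 src/srcstride, x4 the
+// int16_t src2 plane (fixed row stride of 128 bytes), x5 the height and
+// x6 the horizontal fraction mx.  Per row the result is roughly
+// dst[x] = clip_u8((filter(src, x) + src2[x] + 64) >> 7), i.e. a
+// saturating add of src2 followed by a rounding narrowing shift by 7.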
+function ff_hevc_put_hevc_epel_bi_h4_8_neon, export=1
+    load_epel_filterb x6, x7
+    sub         x2, x2, 1
+    mov         x10, 128
+1:  ld1         { v4.8b }, [x2], x3
+    ushr        v5.2d, v4.2d, 8
+    ushr        v6.2d, v5.2d, 8
+    ushr        v7.2d, v6.2d, 8
+    movi        v16.8h, 0
+    calc_epelb  v16, v4, v5, v6, v7
+    ld1         { v20.4h }, [x4], x10
+    sqadd       v16.8h, v16.8h, v20.8h
+    sqrshrun    v4.8b, v16.8h, 7
+    st1         { v4.s }[0], [x0], x1
+    subs        x5, x5, 1       // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h6_8_neon, export=1
+    load_epel_filterb x6, x7
+    sub         x1, x1, 4
+    sub         x2, x2, 1
+    sub         x3, x3, 8
+    mov         x10, 128
+1:  ld1         { v24.8b }, [x2], 8
+    ushr        v26.2d, v24.2d, 8
+    ushr        v27.2d, v26.2d, 8
+    ushr        v28.2d, v27.2d, 8
+    movi        v16.8h, 0
+    ld1         { v28.b }[5], [x2], x3
+    calc_epelb  v16, v24, v26, v27, v28
+    ld1         { v20.8h }, [x4], x10
+    sqadd       v16.8h, v16.8h, v20.8h
+    sqrshrun    v16.8b, v16.8h, 7
+    st1         { v16.s }[0], [x0], 4
+    st1         { v16.h }[2], [x0], x1
+    subs        x5, x5, 1       // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h8_8_neon, export=1
+    load_epel_filterb x6, x7
+    sub         x2, x2, 1
+    mov         x10, 128
+1:  ld2         { v24.8b, v25.8b }, [x2], x3
+    ushr        v26.2d, v24.2d, 8
+    ushr        v27.2d, v25.2d, 8
+    ushr        v28.2d, v26.2d, 8
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    zip1        v16.8h, v16.8h, v17.8h
+    ld1         { v20.8h }, [x4], x10
+    sqadd       v16.8h, v16.8h, v20.8h
+    sqrshrun    v16.8b, v16.8h, 7
+    st1         { v16.8b }, [x0], x1
+    subs        x5, x5, 1       // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h12_8_neon, export=1
+    load_epel_filterb x6, x7
+    sub         x1, x1, 8
+    sub         x2, x2, 1
+    mov         x10, 128
+1:  ld2         { v24.8b, v25.8b }, [x2], x3
+    ushr        v26.2d, v24.2d, 8
+    ushr        v27.2d, v25.2d, 8
+    ushr        v28.2d, v26.2d, 8
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    zip1        v18.8h, v16.8h, v17.8h
+    zip2        v19.8h, v16.8h, v17.8h
+    ld1         { v20.8h, v21.8h }, [x4], x10
+    sqadd       v18.8h, v18.8h, v20.8h
+    sqadd       v19.8h, v19.8h, v21.8h
+    sqrshrun    v20.8b, v18.8h, 7
+    sqrshrun    v21.8b, v19.8h, 7
+    st1         { v20.8b }, [x0], 8
+    st1         { v21.s }[0], [x0], x1
+    subs        x5, x5, 1       // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h16_8_neon, export=1
+    load_epel_filterb x6, x7
+    sub         x2, x2, 1
+    sub         x3, x3, 16
+    mov         x10, 128
+1:  ld2         { v24.8b, v25.8b }, [x2], 16
+    ld1         { v20.s }[0], [x2], x3
+    ushr        v26.2d, v24.2d, 8
+    ushr        v27.2d, v25.2d, 8
+    mov         v26.b[7], v20.b[0]
+    mov         v27.b[7], v20.b[1]
+    ushr        v28.2d, v26.2d, 8
+    mov         v28.b[7], v20.b[2]
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    zip1        v18.8h, v16.8h, v17.8h
+    zip2        v19.8h, v16.8h, v17.8h
+    ld2         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v16.8h, v16.8h, v24.8h
+    sqadd       v17.8h, v17.8h, v25.8h
+    sqrshrun    v4.8b, v16.8h, 7
+    sqrshrun    v5.8b, v17.8h, 7
+    st2         { v4.8b, v5.8b }, [x0], x1
+    subs        x5, x5, 1       // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h24_8_neon, export=1
+    load_epel_filterb x6, x7
+    sub         x2, x2, 1
+    sub         x3, x3, 24
+    mov         x10, 128
+1:  ld3         { v24.8b, v25.8b, v26.8b }, [x2], 24
+    ld1         { v20.s }[0], [x2], x3
+    ushr        v27.2d, v24.2d, 8
+    ushr        v28.2d, v25.2d, 8
+    ushr        v29.2d, v26.2d, 8
+    mov         v27.b[7], v20.b[0]
+    mov         v28.b[7], v20.b[1]
+    mov         v29.b[7], v20.b[2]
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    movi        v18.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    calc_epelb  v18, v26, v27, v28, v29
+    ld3         { v24.8h, v25.8h, v26.8h }, [x4], x10
+    sqadd       v16.8h, v16.8h, v24.8h
+    sqadd       v17.8h, v17.8h, v25.8h
+    sqadd       v18.8h, v18.8h, v26.8h
+    sqrshrun    v4.8b, v16.8h, 7
+    sqrshrun    v5.8b, v17.8h, 7
+    sqrshrun    v6.8b, v18.8h, 7
+    st3         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x5, x5, 1       // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h32_8_neon, export=1
+    load_epel_filterb x6, x7
+    sub         x2, x2, 1
+    sub         x3, x3, 32
+    mov         x10, 128
+1:  ld4         { v24.8b, v25.8b, v26.8b, v27.8b }, [x2], 32
+    ld1         { v20.s }[0], [x2], x3
+    ushr        v28.2d, v24.2d, 8
+    ushr        v29.2d, v25.2d, 8
+    ushr        v30.2d, v26.2d, 8
+    ins         v28.b[7], v20.b[0]
+    ins         v29.b[7], v20.b[1]
+    ins         v30.b[7], v20.b[2]
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    movi        v18.8h, 0
+    movi        v19.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    calc_epelb  v18, v26, v27, v28, v29
+    calc_epelb  v19, v27, v28, v29, v30
+    ld4         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], x10
+    sqadd       v16.8h, v16.8h, v24.8h
+    sqadd       v17.8h, v17.8h, v25.8h
+    sqadd       v18.8h, v18.8h, v26.8h
+    sqadd       v19.8h, v19.8h, v27.8h
+    sqrshrun    v4.8b, v16.8h, 7
+    sqrshrun    v5.8b, v17.8h, 7
+    sqrshrun    v6.8b, v18.8h, 7
+    sqrshrun    v7.8b, v19.8h, 7
+    st4         { v4.8b, v5.8b, v6.8b, v7.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h48_8_neon, export=1
+    load_epel_filterb x6, x7
+    sub         x2, x2, 1
+    sub         x3, x3, 48
+    mov         x7, 24
+    mov         x10, 128 - 48
+1:  ld3         { v26.16b, v27.16b, v28.16b }, [x2], x7
+    ushr        v29.2d, v26.2d, 8
+    ushr        v30.2d, v27.2d, 8
+    ushr        v31.2d, v28.2d, 8
+    ld1         { v24.s }[0], [x2], x7
+    ld1         { v25.s }[0], [x2], x3
+    mov         v29.b[7], v24.b[0]
+    mov         v30.b[7], v24.b[1]
+    mov         v31.b[7], v24.b[2]
+    mov         v29.b[15], v25.b[0]
+    mov         v30.b[15], v25.b[1]
+    mov         v31.b[15], v25.b[2]
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    movi        v18.8h, 0
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    calc_epelb  v16, v26, v27, v28, v29
+    calc_epelb2 v20, v26, v27, v28, v29
+    calc_epelb  v17, v27, v28, v29, v30
+    calc_epelb2 v21, v27, v28, v29, v30
+    calc_epelb  v18, v28, v29, v30, v31
+    calc_epelb2 v22, v28, v29, v30, v31
+    ld3         { v24.8h, v25.8h, v26.8h }, [x4], 48
+    sqadd       v16.8h, v16.8h, v24.8h
+    sqadd       v17.8h, v17.8h, v25.8h
+    sqadd       v18.8h, v18.8h, v26.8h
+    ld3         { v27.8h, v28.8h, v29.8h }, [x4], x10
+    sqadd       v20.8h, v20.8h, v27.8h
+    sqadd       v21.8h, v21.8h, v28.8h
+    sqadd       v22.8h, v22.8h, v29.8h
+    sqrshrun    v4.8b, v16.8h, 7
+    sqrshrun    v5.8b, v17.8h, 7
+    sqrshrun    v6.8b, v18.8h, 7
+    sqrshrun2   v4.16b, v20.8h, 7
+    sqrshrun2   v5.16b, v21.8h, 7
+    sqrshrun2   v6.16b, v22.8h, 7
+    st3         { v4.16b, v5.16b, v6.16b }, [x0], x1
+    subs        x5, x5, 1       // height
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h64_8_neon, export=1
+    load_epel_filterb x6, x7
+    sub         x2, x2, 1
+    sub         x3, x3, 64
+    mov         x7, 32
+1:  ld4         { v24.16b, v25.16b, v26.16b, v27.16b }, [x2], x7
+    ushr        v28.2d, v24.2d, 8
+    ushr        v29.2d, v25.2d, 8
+    ushr        v30.2d, v26.2d, 8
+    ld1         { v4.s }[0], [x2], x7
+    ld1         { v5.s }[0], [x2], x3
+    ins         v28.b[7], v4.b[0]
+    ins         v28.b[15], v5.b[0]
+    ins         v29.b[7], v4.b[1]
+    ins         v29.b[15], v5.b[1]
+    ins         v30.b[7], v4.b[2]
+    ins         v30.b[15], v5.b[2]
+    movi        v16.8h, 0
+    movi        v17.8h, 0
+    movi        v18.8h, 0
+    movi        v19.8h, 0
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    movi        v23.8h, 0
+    calc_epelb  v16, v24, v25, v26, v27
+    calc_epelb2 v20, v24, v25, v26, v27
+    calc_epelb  v17, v25, v26, v27, v28
+    calc_epelb2 v21, v25, v26, v27, v28
+    calc_epelb  v18, v26, v27, v28, v29
+    calc_epelb2 v22, v26, v27, v28, v29
+    calc_epelb  v19, v27, v28, v29, v30
+    calc_epelb2 v23, v27, v28, v29, v30
+    ld4         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], 64
+    sqadd       v16.8h, v16.8h, v24.8h
+    sqadd       v17.8h, v17.8h, v25.8h
+    sqadd       v18.8h, v18.8h, v26.8h
+    sqadd       v19.8h, v19.8h, v27.8h
+    ld4         { v28.8h, v29.8h, v30.8h, v31.8h }, [x4], 64
+    sqadd       v20.8h, v20.8h, v28.8h
+    sqadd       v21.8h, v21.8h, v29.8h
+    sqadd       v22.8h, v22.8h, v30.8h
+    sqadd       v23.8h, v23.8h, v31.8h
+    sqrshrun    v4.8b, v16.8h, 7
+    sqrshrun    v5.8b, v17.8h, 7
+    sqrshrun    v6.8b, v18.8h, 7
+    sqrshrun    v7.8b, v19.8h, 7
+    sqrshrun2   v4.16b, v20.8h, 7
+    sqrshrun2   v5.16b, v21.8h, 7
+    sqrshrun2   v6.16b, v22.8h, 7
+    sqrshrun2   v7.16b, v23.8h, 7
+    st4         { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+
+
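+// put_hevc_epel_bi_v* : vertical counterpart of the bi_h functions; the
+// filter is taken from the vertical fraction (here in x7).  The loop is
+// unrolled four times so that the three source rows already held in
+// registers are reused by rotating the register roles instead of being
+// reloaded every iteration.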
+function ff_hevc_put_hevc_epel_bi_v4_8_neon, export=1
+    load_epel_filterb x7, x6
+    sub         x2, x2, x3
+    mov         x10, 128
+    ld1         { v16.s }[0], [x2], x3
+    ld1         { v17.s }[0], [x2], x3
+    ld1         { v18.s }[0], [x2], x3
+1:  ld1         { v19.s }[0], [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    ld1         { v24.4h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.s }[0], [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    ld1         { v24.4h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.s }[0], [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    ld1         { v24.4h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.s }[0], [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    ld1         { v24.4h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v6_8_neon, export=1
+    load_epel_filterb x7, x6
+    sub         x2, x2, x3
+    sub         x1, x1, 4
+    mov         x10, 128
+    ld1         { v16.8b }, [x2], x3
+    ld1         { v17.8b }, [x2], x3
+    ld1         { v18.8b }, [x2], x3
+1:  ld1         { v19.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    ld1         { v24.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    ld1         { v24.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    ld1         { v24.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    ld1         { v24.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v8_8_neon, export=1
+    load_epel_filterb x7, x6
+    sub         x2, x2, x3
+    mov         x10, 128
+    ld1         { v16.8b }, [x2], x3
+    ld1         { v17.8b }, [x2], x3
+    ld1         { v18.8b }, [x2], x3
+1:  ld1         { v19.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    ld1         { v24.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    ld1         { v24.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    ld1         { v24.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x2], x3
+    movi        v4.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    ld1         { v24.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v12_8_neon, export=1
+    load_epel_filterb x7, x6
+    sub         x1, x1, 8
+    sub         x2, x2, x3
+    mov         x10, 128
+    ld1         { v16.16b }, [x2], x3
+    ld1         { v17.16b }, [x2], x3
+    ld1         { v18.16b }, [x2], x3
+1:  ld1         { v19.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    calc_epelb2 v5, v16, v17, v18, v19
+    ld1         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    calc_epelb2 v5, v17, v18, v19, v16
+    ld1         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    calc_epelb2 v5, v18, v19, v16, v17
+    ld1         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    calc_epelb2 v5, v19, v16, v17, v18
+    ld1         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v16_8_neon, export=1
+    load_epel_filterb x7, x6
+    sub         x2, x2, x3
+    mov         x10, 128
+    ld1         { v16.16b }, [x2], x3
+    ld1         { v17.16b }, [x2], x3
+    ld1         { v18.16b }, [x2], x3
+1:  ld1         { v19.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v16, v17, v18, v19
+    calc_epelb2 v5, v16, v17, v18, v19
+    ld1         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v17, v18, v19, v16
+    calc_epelb2 v5, v17, v18, v19, v16
+    ld1         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v18, v19, v16, v17
+    calc_epelb2 v5, v18, v19, v16, v17
+    ld1         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    calc_epelb  v4, v19, v16, v17, v18
+    calc_epelb2 v5, v19, v16, v17, v18
+    ld1         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v24_8_neon, export=1
+    load_epel_filterb x7, x6
+    sub         x2, x2, x3
+    mov         x10, 128
+    ld1         { v16.8b, v17.8b, v18.8b }, [x2], x3
+    ld1         { v19.8b, v20.8b, v21.8b }, [x2], x3
+    ld1         { v22.8b, v23.8b, v24.8b }, [x2], x3
+1:  ld1         { v25.8b, v26.8b, v27.8b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v16, v19, v22, v25
+    calc_epelb  v5, v17, v20, v23, v26
+    calc_epelb  v6, v18, v21, v24, v27
+    ld1         { v28.8h, v29.8h, v30.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v28.8h
+    sqadd       v5.8h, v5.8h, v29.8h
+    sqadd       v6.8h, v6.8h, v30.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun    v5.8b, v5.8h, 7
+    sqrshrun    v6.8b, v6.8h, 7
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8b, v17.8b, v18.8b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v19, v22, v25, v16
+    calc_epelb  v5, v20, v23, v26, v17
+    calc_epelb  v6, v21, v24, v27, v18
+    ld1         { v28.8h, v29.8h, v30.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v28.8h
+    sqadd       v5.8h, v5.8h, v29.8h
+    sqadd       v6.8h, v6.8h, v30.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun    v5.8b, v5.8h, 7
+    sqrshrun    v6.8b, v6.8h, 7
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v19.8b, v20.8b, v21.8b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v22, v25, v16, v19
+    calc_epelb  v5, v23, v26, v17, v20
+    calc_epelb  v6, v24, v27, v18, v21
+    ld1         { v28.8h, v29.8h, v30.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v28.8h
+    sqadd       v5.8h, v5.8h, v29.8h
+    sqadd       v6.8h, v6.8h, v30.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun    v5.8b, v5.8h, 7
+    sqrshrun    v6.8b, v6.8h, 7
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v22.8b, v23.8b, v24.8b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    calc_epelb  v4, v25, v16, v19, v22
+    calc_epelb  v5, v26, v17, v20, v23
+    calc_epelb  v6, v27, v18, v21, v24
+    ld1         { v28.8h, v29.8h, v30.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v28.8h
+    sqadd       v5.8h, v5.8h, v29.8h
+    sqadd       v6.8h, v6.8h, v30.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun    v5.8b, v5.8h, 7
+    sqrshrun    v6.8b, v6.8h, 7
+    st1         { v4.8b, v5.8b, v6.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v32_8_neon, export=1
+    load_epel_filterb x7, x6
+    sub         x2, x2, x3
+    mov         x10, 128
+    ld1         { v16.16b, v17.16b }, [x2], x3
+    ld1         { v18.16b, v19.16b }, [x2], x3
+    ld1         { v20.16b, v21.16b }, [x2], x3
+1:  ld1         { v22.16b, v23.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v16, v18, v20, v22
+    calc_epelb2 v5, v16, v18, v20, v22
+    calc_epelb  v6, v17, v19, v21, v23
+    calc_epelb2 v7, v17, v19, v21, v23
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqadd       v6.8h, v6.8h, v26.8h
+    sqadd       v7.8h, v7.8h, v27.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    sqrshrun    v5.8b, v6.8h, 7
+    sqrshrun2   v5.16b, v7.8h, 7
+    st1         { v4.16b, v5.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.16b, v17.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v18, v20, v22, v16
+    calc_epelb2 v5, v18, v20, v22, v16
+    calc_epelb  v6, v19, v21, v23, v17
+    calc_epelb2 v7, v19, v21, v23, v17
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqadd       v6.8h, v6.8h, v26.8h
+    sqadd       v7.8h, v7.8h, v27.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    sqrshrun    v5.8b, v6.8h, 7
+    sqrshrun2   v5.16b, v7.8h, 7
+    st1         { v4.16b, v5.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.16b, v19.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v20, v22, v16, v18
+    calc_epelb2 v5, v20, v22, v16, v18
+    calc_epelb  v6, v21, v23, v17, v19
+    calc_epelb2 v7, v21, v23, v17, v19
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqadd       v6.8h, v6.8h, v26.8h
+    sqadd       v7.8h, v7.8h, v27.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    sqrshrun    v5.8b, v6.8h, 7
+    sqrshrun2   v5.16b, v7.8h, 7
+    st1         { v4.16b, v5.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.16b, v21.16b }, [x2], x3
+    movi        v4.8h, 0
+    movi        v5.8h, 0
+    movi        v6.8h, 0
+    movi        v7.8h, 0
+    calc_epelb  v4, v22, v16, v18, v20
+    calc_epelb2 v5, v22, v16, v18, v20
+    calc_epelb  v6, v23, v17, v19, v21
+    calc_epelb2 v7, v23, v17, v19, v21
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v24.8h
+    sqadd       v5.8h, v5.8h, v25.8h
+    sqadd       v6.8h, v6.8h, v26.8h
+    sqadd       v7.8h, v7.8h, v27.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    sqrshrun    v5.8b, v6.8h, 7
+    sqrshrun2   v5.16b, v7.8h, 7
+    st1         { v4.16b, v5.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
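+// The 48- and 64-wide bi_v blocks are handled as two 24/32-wide halves:
+// the saved arguments are reloaded and dst, src and src2 are advanced by
+// the half width (src2 by the same number of int16_t elements, i.e. 48
+// and 64 bytes respectively).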
+function ff_hevc_put_hevc_epel_bi_v48_8_neon, export=1
+    stp         x7, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    bl          ff_hevc_put_hevc_epel_bi_v24_8_neon
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    add         x0, x0, 24
+    add         x2, x2, 24
+    add         x4, x4, 48
+    ldr         x7, [sp]
+    bl          ff_hevc_put_hevc_epel_bi_v24_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v64_8_neon, export=1
+    stp         x7, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    bl          ff_hevc_put_hevc_epel_bi_v32_8_neon
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    add         x0, x0, 32
+    add         x2, x2, 32
+    add         x4, x4, 64
+    ldr         x7, [sp]
+    bl          ff_hevc_put_hevc_epel_bi_v32_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
+
+
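+// put_hevc_epel_bi_hv* : separable horizontal+vertical case.  A
+// (height + 3)-row temporary with a 128-byte row stride is reserved on
+// the stack, the matching epel_h* kernel is called to fill it with
+// 16-bit horizontally filtered samples, and the code below then filters
+// that buffer vertically and combines the result with src2 exactly as
+// in the bi_h/bi_v functions.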
+function ff_hevc_put_hevc_epel_bi_hv4_8_neon, export=1
+    add         x10, x5, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x5, 3
+    mov         x4, x6
+    mov         x5, x7
+    bl          ff_hevc_put_hevc_epel_h4_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x7, x6
+    mov         x10, 128
+    ld1         { v16.4h }, [sp], x10
+    ld1         { v17.4h }, [sp], x10
+    ld1         { v18.4h }, [sp], x10
+1:  ld1         { v19.4h }, [sp], x10
+    calc_epelh  v4, v16, v17, v18, v19
+    ld1         { v6.4h }, [x4], x10
+    sqadd       v4.4h, v4.4h, v6.4h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.4h }, [sp], x10
+    calc_epelh  v4, v17, v18, v19, v16
+    ld1         { v6.4h }, [x4], x10
+    sqadd       v4.4h, v4.4h, v6.4h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.4h }, [sp], x10
+    calc_epelh  v4, v18, v19, v16, v17
+    ld1         { v6.4h }, [x4], x10
+    sqadd       v4.4h, v4.4h, v6.4h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.4h }, [sp], x10
+    calc_epelh  v4, v19, v16, v17, v18
+    ld1         { v6.4h }, [x4], x10
+    sqadd       v4.4h, v4.4h, v6.4h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv6_8_neon, export=1
+    add         x10, x5, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x5, 3
+    mov         x4, x6
+    mov         x5, x7
+    bl          ff_hevc_put_hevc_epel_h6_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x7, x6
+    sub         x1, x1, 4
+    mov         x10, 128
+    ld1         { v16.8h }, [sp], x10
+    ld1         { v17.8h }, [sp], x10
+    ld1         { v18.8h }, [sp], x10
+1:  ld1         { v19.8h }, [sp], x10
+    calc_epelh  v4, v16, v17, v18, v19
+    calc_epelh2 v4, v5, v16, v17, v18, v19
+    ld1         { v6.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x10
+    calc_epelh  v4, v17, v18, v19, v16
+    calc_epelh2 v4, v5, v17, v18, v19, v16
+    ld1         { v6.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v19, v16, v17
+    calc_epelh2 v4, v5, v18, v19, v16, v17
+    ld1         { v6.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x10
+    calc_epelh  v4, v19, v16, v17, v18
+    calc_epelh2 v4, v5, v19, v16, v17, v18
+    ld1         { v6.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.s }[0], [x0], 4
+    st1         { v4.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv8_8_neon, export=1
+    add         x10, x5, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x5, 3
+    mov         x4, x6
+    mov         x5, x7
+    bl          ff_hevc_put_hevc_epel_h8_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x7, x6
+    mov         x10, 128
+    ld1         { v16.8h }, [sp], x10
+    ld1         { v17.8h }, [sp], x10
+    ld1         { v18.8h }, [sp], x10
+1:  ld1         { v19.8h }, [sp], x10
+    calc_epelh  v4, v16, v17, v18, v19
+    calc_epelh2 v4, v5, v16, v17, v18, v19
+    ld1         { v6.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x10
+    calc_epelh  v4, v17, v18, v19, v16
+    calc_epelh2 v4, v5, v17, v18, v19, v16
+    ld1         { v6.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v19, v16, v17
+    calc_epelh2 v4, v5, v18, v19, v16, v17
+    ld1         { v6.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x10
+    calc_epelh  v4, v19, v16, v17, v18
+    calc_epelh2 v4, v5, v19, v16, v17, v18
+    ld1         { v6.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv12_8_neon, export=1
+    add         x10, x5, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x5, 3
+    mov         x4, x6
+    mov         x5, x7
+    bl          ff_hevc_put_hevc_epel_h12_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x7, x6
+    sub         x1, x1, 8
+    mov         x10, 128
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    ld1         { v20.8h, v21.8h }, [sp], x10
+1:  ld1         { v22.8h, v23.8h }, [sp], x10
+    calc_epelh  v4, v16, v18, v20, v22
+    calc_epelh2 v4, v5, v16, v18, v20, v22
+    calc_epelh  v5, v17, v19, v21, v23
+    ld1         { v6.8h, v7.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqadd       v5.8h, v5.8h, v7.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v20, v22, v16
+    calc_epelh2 v4, v5, v18, v20, v22, v16
+    calc_epelh  v5, v19, v21, v23, v17
+    ld1         { v6.8h, v7.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqadd       v5.8h, v5.8h, v7.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    calc_epelh  v4, v20, v22, v16, v18
+    calc_epelh2 v4, v5, v20, v22, v16, v18
+    calc_epelh  v5, v21, v23, v17, v19
+    ld1         { v6.8h, v7.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqadd       v5.8h, v5.8h, v7.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.8h, v21.8h }, [sp], x10
+    calc_epelh  v4, v22, v16, v18, v20
+    calc_epelh2 v4, v5, v22, v16, v18, v20
+    calc_epelh  v5, v23, v17, v19, v21
+    ld1         { v6.8h, v7.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqadd       v5.8h, v5.8h, v7.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.8b }, [x0], 8
+    st1         { v4.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv16_8_neon, export=1
+    add         x10, x5, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x5, 3
+    mov         x4, x6
+    mov         x5, x7
+    bl          ff_hevc_put_hevc_epel_h16_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x7, x6
+    mov         x10, 128
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    ld1         { v20.8h, v21.8h }, [sp], x10
+1:  ld1         { v22.8h, v23.8h }, [sp], x10
+    calc_epelh  v4, v16, v18, v20, v22
+    calc_epelh2 v4, v5, v16, v18, v20, v22
+    calc_epelh  v5, v17, v19, v21, v23
+    calc_epelh2 v5, v6, v17, v19, v21, v23
+    ld1         { v6.8h, v7.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqadd       v5.8h, v5.8h, v7.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h }, [sp], x10
+    calc_epelh  v4, v18, v20, v22, v16
+    calc_epelh2 v4, v5, v18, v20, v22, v16
+    calc_epelh  v5, v19, v21, v23, v17
+    calc_epelh2 v5, v6, v19, v21, v23, v17
+    ld1         { v6.8h, v7.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqadd       v5.8h, v5.8h, v7.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.8h, v19.8h }, [sp], x10
+    calc_epelh  v4, v20, v22, v16, v18
+    calc_epelh2 v4, v5, v20, v22, v16, v18
+    calc_epelh  v5, v21, v23, v17, v19
+    calc_epelh2 v5, v6, v21, v23, v17, v19
+    ld1         { v6.8h, v7.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqadd       v5.8h, v5.8h, v7.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.8h, v21.8h }, [sp], x10
+    calc_epelh  v4, v22, v16, v18, v20
+    calc_epelh2 v4, v5, v22, v16, v18, v20
+    calc_epelh  v5, v23, v17, v19, v21
+    calc_epelh2 v5, v6, v23, v17, v19, v21
+    ld1         { v6.8h, v7.8h }, [x4], x10
+    sqadd       v4.8h, v4.8h, v6.8h
+    sqadd       v5.8h, v5.8h, v7.8h
+    sqrshrun    v4.8b, v4.8h, 7
+    sqrshrun2   v4.16b, v5.8h, 7
+    st1         { v4.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv24_8_neon, export=1
+    add         x10, x5, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x5, 3
+    mov         x4, x6
+    mov         x5, x7
+    bl          ff_hevc_put_hevc_epel_h24_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x7, x6
+    mov         x10, 128
+    ld1         { v16.8h, v17.8h, v18.8h }, [sp], x10
+    ld1         { v19.8h, v20.8h, v21.8h }, [sp], x10
+    ld1         { v22.8h, v23.8h, v24.8h }, [sp], x10
+1:  ld1         { v25.8h, v26.8h, v27.8h }, [sp], x10
+    calc_epelh  v1, v16, v19, v22, v25
+    calc_epelh2 v1, v2, v16, v19, v22, v25
+    calc_epelh  v2, v17, v20, v23, v26
+    calc_epelh2 v2, v3, v17, v20, v23, v26
+    calc_epelh  v3, v18, v21, v24, v27
+    calc_epelh2 v3, v4, v18, v21, v24, v27
+    ld1         { v4.8h, v5.8h, v6.8h }, [x4], x10
+    sqadd       v1.8h, v1.8h, v4.8h
+    sqadd       v2.8h, v2.8h, v5.8h
+    sqadd       v3.8h, v3.8h, v6.8h
+    sqrshrun    v1.8b, v1.8h, 7
+    sqrshrun    v2.8b, v2.8h, 7
+    sqrshrun    v3.8b, v3.8h, 7
+    st1         { v1.8b, v2.8b, v3.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h, v18.8h }, [sp], x10
+    calc_epelh  v1, v19, v22, v25, v16
+    calc_epelh2 v1, v2, v19, v22, v25, v16
+    calc_epelh  v2, v20, v23, v26, v17
+    calc_epelh2 v2, v3, v20, v23, v26, v17
+    calc_epelh  v3, v21, v24, v27, v18
+    calc_epelh2 v3, v4, v21, v24, v27, v18
+    ld1         { v4.8h, v5.8h, v6.8h }, [x4], x10
+    sqadd       v1.8h, v1.8h, v4.8h
+    sqadd       v2.8h, v2.8h, v5.8h
+    sqadd       v3.8h, v3.8h, v6.8h
+    sqrshrun    v1.8b, v1.8h, 7
+    sqrshrun    v2.8b, v2.8h, 7
+    sqrshrun    v3.8b, v3.8h, 7
+    st1         { v1.8b, v2.8b, v3.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v19.8h, v20.8h, v21.8h }, [sp], x10
+    calc_epelh  v1, v22, v25, v16, v19
+    calc_epelh2 v1, v2, v22, v25, v16, v19
+    calc_epelh  v2, v23, v26, v17, v20
+    calc_epelh2 v2, v3, v23, v26, v17, v20
+    calc_epelh  v3, v24, v27, v18, v21
+    calc_epelh2 v3, v4, v24, v27, v18, v21
+    ld1         { v4.8h, v5.8h, v6.8h }, [x4], x10
+    sqadd       v1.8h, v1.8h, v4.8h
+    sqadd       v2.8h, v2.8h, v5.8h
+    sqadd       v3.8h, v3.8h, v6.8h
+    sqrshrun    v1.8b, v1.8h, 7
+    sqrshrun    v2.8b, v2.8h, 7
+    sqrshrun    v3.8b, v3.8h, 7
+    st1         { v1.8b, v2.8b, v3.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v22.8h, v23.8h, v24.8h }, [sp], x10
+    calc_epelh  v1, v25, v16, v19, v22
+    calc_epelh2 v1, v2, v25, v16, v19, v22
+    calc_epelh  v2, v26, v17, v20, v23
+    calc_epelh2 v2, v3, v26, v17, v20, v23
+    calc_epelh  v3, v27, v18, v21, v24
+    calc_epelh2 v3, v4, v27, v18, v21, v24
+    ld1         { v4.8h, v5.8h, v6.8h }, [x4], x10
+    sqadd       v1.8h, v1.8h, v4.8h
+    sqadd       v2.8h, v2.8h, v5.8h
+    sqadd       v3.8h, v3.8h, v6.8h
+    sqrshrun    v1.8b, v1.8h, 7
+    sqrshrun    v2.8b, v2.8h, 7
+    sqrshrun    v3.8b, v3.8h, 7
+    st1         { v1.8b, v2.8b, v3.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ret
+endfunc
+
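+// bi_hv32 additionally uses v8, which is callee-saved under the AAPCS64
+// vector register convention, so it is spilled and restored around the
+// function body.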
+function ff_hevc_put_hevc_epel_bi_hv32_8_neon, export=1
+    sub         sp, sp, 16
+    st1         { v8.16b }, [sp]
+    add         x10, x5, 3
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3
+    mov         x2, x3
+    add         x3, x5, 3
+    mov         x4, x6
+    mov         x5, x7
+    bl          ff_hevc_put_hevc_epel_h32_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_epel_filterh x7, x6
+    mov         x10, 128
+    ld1         { v16.8h, v17.8h, v18.8h, v19.8h }, [sp], x10
+    ld1         { v20.8h, v21.8h, v22.8h, v23.8h }, [sp], x10
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [sp], x10
+1:  ld1         { v28.8h, v29.8h, v30.8h, v31.8h }, [sp], x10
+    calc_epelh  v1, v16, v20, v24, v28
+    calc_epelh2 v1, v2, v16, v20, v24, v28
+    calc_epelh  v2, v17, v21, v25, v29
+    calc_epelh2 v2, v3, v17, v21, v25, v29
+    calc_epelh  v3, v18, v22, v26, v30
+    calc_epelh2 v3, v4, v18, v22, v26, v30
+    calc_epelh  v4, v19, v23, v27, v31
+    calc_epelh2 v4, v5, v19, v23, v27, v31
+    ld1         { v5.8h, v6.8h, v7.8h, v8.8h }, [x4], x10
+    sqadd       v1.8h, v1.8h, v5.8h
+    sqadd       v2.8h, v2.8h, v6.8h
+    sqadd       v3.8h, v3.8h, v7.8h
+    sqadd       v4.8h, v4.8h, v8.8h
+    sqrshrun    v1.8b, v1.8h, 7
+    sqrshrun    v2.8b, v2.8h, 7
+    sqrshrun    v3.8b, v3.8h, 7
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v1.8b, v2.8b, v3.8b, v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h, v18.8h, v19.8h }, [sp], x10
+    calc_epelh  v1, v20, v24, v28, v16
+    calc_epelh2 v1, v2, v20, v24, v28, v16
+    calc_epelh  v2, v21, v25, v29, v17
+    calc_epelh2 v2, v3, v21, v25, v29, v17
+    calc_epelh  v3, v22, v26, v30, v18
+    calc_epelh2 v3, v4, v22, v26, v30, v18
+    calc_epelh  v4, v23, v27, v31, v19
+    calc_epelh2 v4, v5, v23, v27, v31, v19
+    ld1         { v5.8h, v6.8h, v7.8h, v8.8h }, [x4], x10
+    sqadd       v1.8h, v1.8h, v5.8h
+    sqadd       v2.8h, v2.8h, v6.8h
+    sqadd       v3.8h, v3.8h, v7.8h
+    sqadd       v4.8h, v4.8h, v8.8h
+    sqrshrun    v1.8b, v1.8h, 7
+    sqrshrun    v2.8b, v2.8h, 7
+    sqrshrun    v3.8b, v3.8h, 7
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v1.8b, v2.8b, v3.8b, v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.8h, v21.8h, v22.8h, v23.8h }, [sp], x10
+    calc_epelh  v1, v24, v28, v16, v20
+    calc_epelh2 v1, v2, v24, v28, v16, v20
+    calc_epelh  v2, v25, v29, v17, v21
+    calc_epelh2 v2, v3, v25, v29, v17, v21
+    calc_epelh  v3, v26, v30, v18, v22
+    calc_epelh2 v3, v4, v26, v30, v18, v22
+    calc_epelh  v4, v27, v31, v19, v23
+    calc_epelh2 v4, v5, v27, v31, v19, v23
+    ld1         { v5.8h, v6.8h, v7.8h, v8.8h }, [x4], x10
+    sqadd       v1.8h, v1.8h, v5.8h
+    sqadd       v2.8h, v2.8h, v6.8h
+    sqadd       v3.8h, v3.8h, v7.8h
+    sqadd       v4.8h, v4.8h, v8.8h
+    sqrshrun    v1.8b, v1.8h, 7
+    sqrshrun    v2.8b, v2.8h, 7
+    sqrshrun    v3.8b, v3.8h, 7
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v1.8b, v2.8b, v3.8b, v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [sp], x10
+    calc_epelh  v1, v28, v16, v20, v24
+    calc_epelh2 v1, v2, v28, v16, v20, v24
+    calc_epelh  v2, v29, v17, v21, v25
+    calc_epelh2 v2, v3, v29, v17, v21, v25
+    calc_epelh  v3, v30, v18, v22, v26
+    calc_epelh2 v3, v4, v30, v18, v22, v26
+    calc_epelh  v4, v31, v19, v23, v27
+    calc_epelh2 v4, v5, v31, v19, v23, v27
+    ld1         { v5.8h, v6.8h, v7.8h, v8.8h }, [x4], x10
+    sqadd       v1.8h, v1.8h, v5.8h
+    sqadd       v2.8h, v2.8h, v6.8h
+    sqadd       v3.8h, v3.8h, v7.8h
+    sqadd       v4.8h, v4.8h, v8.8h
+    sqrshrun    v1.8b, v1.8h, 7
+    sqrshrun    v2.8b, v2.8h, 7
+    sqrshrun    v3.8b, v3.8h, 7
+    sqrshrun    v4.8b, v4.8h, 7
+    st1         { v1.8b, v2.8b, v3.8b, v4.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+2:  ld1         { v8.16b }, [sp], 16
+    ret
+endfunc
+
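+// As with the uni_hv and bi_v cases, the 48- and 64-wide bi_hv blocks
+// are split into two 24/32-wide halves with dst, src and src2 advanced
+// between the two calls.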
+function ff_hevc_put_hevc_epel_bi_hv48_8_neon, export=1
+    stp         xzr, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x6, x7, [sp, -16]!
+    bl          ff_hevc_put_hevc_epel_bi_hv24_8_neon
+    ldp         x6, x7, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    add         x0, x0, 24
+    add         x2, x2, 24
+    add         x4, x4, 48
+    bl          ff_hevc_put_hevc_epel_bi_hv24_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv64_8_neon, export=1
+    stp         xzr, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x6, x7, [sp, -16]!
+    bl          ff_hevc_put_hevc_epel_bi_hv32_8_neon
+    ldp         x6, x7, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    add         x0, x0, 32
+    add         x2, x2, 32
+    add         x4, x4, 64
+    bl          ff_hevc_put_hevc_epel_bi_hv32_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
+
+
diff --git a/libavcodec/aarch64/hevcdsp_idct_8.S b/libavcodec/aarch64/hevcdsp_idct_8.S
new file mode 100644
index 0000000000..d251a52e3d
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_idct_8.S
@@ -0,0 +1,1980 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+
+.Lo0_coeff:
+    .hword      83, 36, 0, 0, 0, 0, 0, 0
+.Lo8transform0:
+    .hword      89,  75,  50,  18               // transform[4,12,20,28][0]
+.Lo8transform1:
+    .hword      75, -18, -89, -50
+.Lo8transform2:
+    .hword      50, -89,  18,  75
+.Lo8transform3:
+    .hword      18, -50,  75, -89
+
+.LimitMask:
+    .hword  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0
+    .hword  0xffff,      0,      0,      0,      0, 0, 0, 0
+
+.Leo_coeff:
+    .hword      64,  64,  64,  64,  83,  36, -36, -83
+    .hword      64, -64, -64,  64,  36, -83,  83, -36
+    .hword      89,  75,  50,  18,  75, -18, -89, -50   // transform[4,12][0-3]
+    .hword      50, -89,  18,  75,  18, -50,  75, -89   // transform[20,28][0-3]
+.Lo16transform0:    // transform[2][0-7], also transform[2,6,10..][0]
+    .hword      90,  87,  80,  70,  57,  43,  25,   9
+.Lo16transform1:
+    .hword      87,  57,   9, -43, -80, -90, -70, -25   // transform[6][0-7]
+.Lo16transform2:
+    .hword      80,   9, -70, -87, -25,  57,  90,  43   // transform[10][0-7]
+.Lo16transform3:
+    .hword      70, -43, -87,   9,  90,  25, -80, -57   // transform[14][0-7]
+.Lo16transform4:
+    .hword      57, -80, -25,  90,  -9, -87,  43,  70   // transform[18][0-7]
+.Lo16transform5:
+    .hword      43, -90,  57,  25, -87,  70,   9, -80   // transform[22][0-7]
+.Lo16transform6:
+    .hword      25, -70,  90, -80,  43,   9, -57,  87   // transform[26][0-7]
+.Lo16transform7:
+    .hword       9, -25,  43, -57,  70, -80,  87, -90   // transform[30][0-7]
+.Lo32transform:
+    .hword  90,  90,  88,  85,  82,  78,  73,  67   // transform[1,3,5,7..15][1]
+    .hword  61,  54,  46,  38,  31,  22,  13,   4   // transform[17,19,21..31][1]
+    .hword  90,  82,  67,  46,  22,  -4, -31, -54   // transform[1,3,5,7..15][3]
+    .hword -73, -85, -90, -88, -78, -61, -38, -13   // transform[17,19,21..31][3]
+    .hword  88,  67,  31, -13, -54, -82, -90, -78   // ..
+    .hword -46, -4,   38,  73,  90,  85,  61,  22
+    .hword  85,  46, -13, -67, -90, -73, -22,  38
+    .hword  82,  88,  54,  -4, -61, -90, -78, -31
+.Lo32transform9_31:
+    .hword  82,  22, -54, -90, -61,  13,  78,  85
+    .hword  31, -46, -90, -67,   4,  73,  88,  38
+    .hword  78,  -4, -82, -73,  13,  85,  67, -22
+    .hword -88, -61,  31,  90,  54, -38, -90, -46
+    .hword  73, -31, -90, -22,  78,  67, -38, -90
+    .hword -13,  82,  61, -46, -88,  -4,  85,  54
+    .hword  67, -54, -78,  38,  85, -22, -90,   4
+    .hword  90,  13, -88, -31,  82,  46, -73, -61
+    .hword  61, -73, -46,  82,  31, -88, -13,  90
+    .hword  -4, -90,  22,  85, -38, -78,  54,  67
+    .hword  54, -85,  -4,  88, -46, -61,  82,  13
+    .hword -90,  38,  67, -78, -22,  90, -31, -73
+    .hword  46, -90,  38,  54, -90,  31,  61, -88
+    .hword  22,  67, -85,  13,  73, -82,   4,  78
+    .hword  38, -88,  73,  -4, -67,  90, -46, -31
+    .hword  85, -78,  13,  61, -90,  54,  22, -82
+    .hword  31, -78,  90, -61,   4,  54, -88,  82
+    .hword -38, -22,  73, -90,  67, -13, -46,  85
+    .hword  22, -61,  85, -90,  73, -38,  -4,  46
+    .hword -78,  90, -82,  54, -13, -31,  67, -88
+    .hword  13, -38,  61, -78,  88, -90,  85, -73
+    .hword  54, -31,   4,  22, -46,  67, -82,  90
+    .hword   4, -13,  22, -31,  38, -46,  54, -61   // transform[1,3,5,7..15][31]
+    .hword  67, -73,  78, -82,  85, -88,  90, -90   // transform[17,19,21..31][31]
+
+
+// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
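+// One pass of the loop at 1: below transforms eight columns at a time;
+// the sixteen coefficient rows are loaded with a 32-byte stride.  For
+// small col_limit values the computed branches clear the tails of some
+// odd rows with .LimitMask (the "limit2" bound noted in the comments)
+// before the even/odd butterfly stages are evaluated.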
+function ff_hevc_idct_16x16_8_neon, export=1
+    sub         sp, sp, 64
+    st1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp]
+    sub         sp, sp, 32
+    st1         { v14.16b, v15.16b }, [sp]
+    mov         x3, 0
+    mov         x2, x0
+1:  mov         x4, x2
+    mov         x5, 32
+    ld1         { v16.8h }, [x4], x5
+    ld1         { v17.8h }, [x4], x5
+    ld1         { v18.8h }, [x4], x5
+    ld1         { v19.8h }, [x4], x5
+    ld1         { v20.8h }, [x4], x5
+    ld1         { v21.8h }, [x4], x5
+    ld1         { v22.8h }, [x4], x5
+    ld1         { v23.8h }, [x4], x5
+    ld1         { v24.8h }, [x4], x5
+    ld1         { v25.8h }, [x4], x5
+    ld1         { v26.8h }, [x4], x5
+    ld1         { v27.8h }, [x4], x5
+    ld1         { v28.8h }, [x4], x5
+    ld1         { v29.8h }, [x4], x5
+    ld1         { v30.8h }, [x4], x5
+    ld1         { v31.8h }, [x4], x5
+    cmp         x1, 12
+    b.hs        5f
+    // limit2 below 16
+    bic         x4, x1, 1
+    adr         x5, .LimitMask
+    cbnz        x3, 3f
+    // columns 0 .. 7 - cleanup of indexes 5 .. 7
+    ld1         { v0.8h }, [x5]
+    adr         x5, 2f
+    add         x5, x5, x4, lsl 2
+    add         x5, x5, x4, lsl 1
+    br          x5
+2:  and         v17.16b, v17.16b, v0.16b    // col_limit 0..1 -> limit2 == 4..5
+    and         v19.16b, v19.16b, v0.16b
+    b           5f
+    and         v19.16b, v19.16b, v0.16b    // col_limit 2..3 -> limit2 == 6..7
+    and         v21.16b, v21.16b, v0.16b
+    b           5f
+    and         v21.16b, v21.16b, v0.16b    // col_limit 4..5 -> limit2 == 8..9
+    and         v23.16b, v23.16b, v0.16b
+    b           5f
+    and         v23.16b, v23.16b, v0.16b    // col_limit 6..7 -> limit2 == 10..11
+    and         v25.16b, v25.16b, v0.16b
+    b           5f
+    and         v25.16b, v25.16b, v0.16b    // col_limit 8..9 -> limit2 == 12..13
+    and         v27.16b, v27.16b, v0.16b
+    b           5f
+    and         v27.16b, v27.16b, v0.16b    // col_limit 10..11 -> limit2 == 14..15
+    and         v29.16b, v29.16b, v0.16b
+    b           5f
+    // columns 8 .. 15
+3:  subs        x4, x4, 2
+    b.lo        5f
+    ld1         { v0.8h, v1.8h }, [x5]
+    adr         x5, 4f
+    add         x5, x5, x4, lsl 3
+    add         x5, x5, x4, lsl 1
+    br          x5
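+    // Each entry in the table at 4f is padded with nops to five instructions
+    // (20 bytes) so that the computed offset (x4 * 10 bytes) lands on an
+    // entry boundary.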
+4:  and         v17.16b, v17.16b, v1.16b    // col_limit 2..3 -> limit2 == 2..3
+    b           5f
+    nop
+    nop
+    nop
+    and         v17.16b, v17.16b, v1.16b    // col_limit 4..5 -> limit2 == 4..5
+    and         v19.16b, v19.16b, v1.16b
+    b           5f
+    nop
+    nop
+    and         v17.16b, v17.16b, v0.16b    // col_limit 6..7 -> limit2 == 6..7
+    and         v19.16b, v19.16b, v1.16b
+    and         v21.16b, v21.16b, v1.16b
+    b           5f
+    nop
+    and         v17.16b, v17.16b, v0.16b    // col_limit 8..9 -> limit2 == 8..9
+    and         v19.16b, v19.16b, v0.16b
+    and         v21.16b, v21.16b, v1.16b
+    and         v23.16b, v23.16b, v1.16b
+    b           5f
+    and         v19.16b, v19.16b, v0.16b    // col_limit 10..11 -> limit2 == 10..11
+    and         v21.16b, v21.16b, v0.16b
+    and         v23.16b, v23.16b, v1.16b
+    and         v25.16b, v25.16b, v1.16b
+    b           5f
+5:  adr         x4, .Lo0_coeff
+    ld1         { v14.8h }, [x4]
+
+    // v0,v1 = e0
+    sshll       v0.4s, v16.4h, 6
+    sshll       v1.4s, v24.4h, 6
+    add         v0.4s, v0.4s, v1.4s
+    sshll2      v1.4s, v16.8h, 6
+    sshll2      v2.4s, v24.8h, 6
+    add         v1.4s, v1.4s, v2.4s
+
+    // v2,v3 = o0
+    smull       v2.4s, v20.4h, v14.h[0]
+    smlal       v2.4s, v28.4h, v14.h[1]
+    smull2      v3.4s, v20.8h, v14.h[0]
+    smlal2      v3.4s, v28.8h, v14.h[1]
+
+    // v4,v5 = e_8[0]
+    add         v4.4s, v0.4s, v2.4s
+    add         v5.4s, v1.4s, v3.4s
+
+    // v6,v7 = e_8[3]
+    sub         v6.4s, v0.4s, v2.4s
+    sub         v7.4s, v1.4s, v3.4s
+
+
+    // v0,v1 = o_8[0]
+    adr         x4, .Lo8transform0
+    ld1         { v15.4h }, [x4]
+    smull       v0.4s, v18.4h, v15.h[0]
+    smlal       v0.4s, v22.4h, v15.h[1]
+    smlal       v0.4s, v26.4h, v15.h[2]
+    smlal       v0.4s, v30.4h, v15.h[3]
+    smull2      v1.4s, v18.8h, v15.h[0]
+    smlal2      v1.4s, v22.8h, v15.h[1]
+    smlal2      v1.4s, v26.8h, v15.h[2]
+    smlal2      v1.4s, v30.8h, v15.h[3]
+
+    // v2,v3 = e_16[0]
+    add         v2.4s, v4.4s, v0.4s
+    add         v3.4s, v5.4s, v1.4s
+
+    // v8,v9 = o_16[0]
+    adr         x4, .Lo16transform0
+    ld1         { v15.8h }, [x4]
+
+    mov         x5, 16
+    cmp         x1, 12
+    b.hs        6f
+    add         x5, x1, 4
+    bic         x5, x5, 1
+    cbz         x3, 6f
+    orr         x5, x1, 1
+    subs        x5, x5, 2
+    csel        x5, x5, xzr, hs
+6:  mov         x4, 64
+    sub         x6, x4, x5, lsl 2
+    adr         x5, 7f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
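+    // x6 is the byte offset into the chain at 7f (and is reused for the other
+    // o_16 chains below): the smlal/smlal2 pairs for odd rows beyond the
+    // column limit are simply jumped over, with v8/v9 pre-zeroed so the
+    // shortened sum stays correct.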
+7:  smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // tmp[0 * 16]
+    add         v10.4s, v2.4s, v8.4s
+    add         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    st1         { v10.8h }, [x2]
+
+    // tmp[15 * 16]
+    sub         v10.4s, v2.4s, v8.4s
+    sub         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 15 * 32
+    st1         { v10.8h }, [x4]
+
+    // v2,v3 = e_16[7]
+    sub         v2.4s, v4.4s, v0.4s
+    sub         v3.4s, v5.4s, v1.4s
+
+    // v8,v9 = o_16[7]
+    adr         x4, .Lo16transform7
+    ld1         { v15.8h }, [x4]
+    adr         x5, 8f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+8:  smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // tmp[7 * 16]
+    add         v10.4s, v2.4s, v8.4s
+    add         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 7 * 32
+    st1         { v10.8h }, [x4]
+
+    // tmp[8 * 16]
+    sub         v10.4s, v2.4s, v8.4s
+    sub         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 8 * 32
+    st1         { v10.8h }, [x4]
+
+    // v0,v1 = o_8[3]
+    adr         x4, .Lo8transform3
+    ld1         { v15.4h }, [x4]
+    smull       v0.4s, v18.4h, v15.h[0]
+    smlal       v0.4s, v22.4h, v15.h[1]
+    smlal       v0.4s, v26.4h, v15.h[2]
+    smlal       v0.4s, v30.4h, v15.h[3]
+    smull2      v1.4s, v18.8h, v15.h[0]
+    smlal2      v1.4s, v22.8h, v15.h[1]
+    smlal2      v1.4s, v26.8h, v15.h[2]
+    smlal2      v1.4s, v30.8h, v15.h[3]
+
+    // v2,v3 = e_16[3]
+    add         v2.4s, v6.4s, v0.4s
+    add         v3.4s, v7.4s, v1.4s
+
+    // v8,v9 = o_16[3]
+    adr         x4, .Lo16transform3
+    ld1         { v15.8h }, [x4]
+    adr         x5, 9f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+9:  smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6] // 13
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5] // 11
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4] // 9
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3] // 7
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2] // 5
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1] // 3
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0] // 1
+
+    // tmp[3 * 16]
+    add         v10.4s, v2.4s, v8.4s
+    add         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 3 * 32
+    st1         { v10.8h }, [x4]
+
+    // tmp[12 * 16]
+    sub         v10.4s, v2.4s, v8.4s
+    sub         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 12 * 32
+    st1         { v10.8h }, [x4]
+
+    // v2,v3 = e_16[4]
+    sub         v2.4s, v6.4s, v0.4s
+    sub         v3.4s, v7.4s, v1.4s
+
+    // v8,v9 = o_16[4]
+    adr         x4, .Lo16transform4
+    ld1         { v15.8h }, [x4]
+    adr         x5, 10f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+10: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // tmp[4 * 16]
+    add         v10.4s, v2.4s, v8.4s
+    add         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 4 * 32
+    st1         { v10.8h }, [x4]
+
+    // tmp[11 * 16]
+    sub         v10.4s, v2.4s, v8.4s
+    sub         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 11 * 32
+    st1         { v10.8h }, [x4]
+
+
+    // v0,v1 = e1
+    sshll       v0.4s, v16.4h, 6
+    sshll       v1.4s, v24.4h, 6
+    sub         v0.4s, v0.4s, v1.4s
+    sshll2      v1.4s, v16.8h, 6
+    sshll2      v2.4s, v24.8h, 6
+    sub         v1.4s, v1.4s, v2.4s
+
+    // v2,v3 = o1
+    smull       v2.4s, v20.4h, v14.h[1]
+    smlsl       v2.4s, v28.4h, v14.h[0]
+    smull2      v3.4s, v20.8h, v14.h[1]
+    smlsl2      v3.4s, v28.8h, v14.h[0]
+
+    // v4,v5 = e_8[1]
+    add         v4.4s, v0.4s, v2.4s
+    add         v5.4s, v1.4s, v3.4s
+
+    // v6,v7 = e_8[2]
+    sub         v6.4s, v0.4s, v2.4s
+    sub         v7.4s, v1.4s, v3.4s
+
+    // v0,v1 = o_8[1]
+    adr         x4, .Lo8transform1
+    ld1         { v15.4h }, [x4]
+    smull       v0.4s, v18.4h, v15.h[0]
+    smlal       v0.4s, v22.4h, v15.h[1]
+    smlal       v0.4s, v26.4h, v15.h[2]
+    smlal       v0.4s, v30.4h, v15.h[3]
+    smull2      v1.4s, v18.8h, v15.h[0]
+    smlal2      v1.4s, v22.8h, v15.h[1]
+    smlal2      v1.4s, v26.8h, v15.h[2]
+    smlal2      v1.4s, v30.8h, v15.h[3]
+
+    // v2,v3 = e_16[1]
+    add         v2.4s, v4.4s, v0.4s
+    add         v3.4s, v5.4s, v1.4s
+
+    // v8,v9 = o_16[1]
+    adr         x4, .Lo16transform1
+    ld1         { v15.8h }, [x4]
+    adr         x5, 11f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+11: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // tmp[1 * 16]
+    add         v10.4s, v2.4s, v8.4s
+    add         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 1 * 32
+    st1         { v10.8h }, [x4]
+
+    // tmp[14 * 16]
+    sub         v10.4s, v2.4s, v8.4s
+    sub         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 14 * 32
+    st1         { v10.8h }, [x4]
+
+    // v2,v3 = e_16[6]
+    sub         v2.4s, v4.4s, v0.4s
+    sub         v3.4s, v5.4s, v1.4s
+
+    // v8,v9 = o_16[6]
+    adr         x4, .Lo16transform6
+    ld1         { v15.8h }, [x4]
+    adr         x5, 12f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+12: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // tmp[6 * 16]
+    add         v10.4s, v2.4s, v8.4s
+    add         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 6 * 32
+    st1         { v10.8h }, [x4]
+
+    // tmp[9 * 16]
+    sub         v10.4s, v2.4s, v8.4s
+    sub         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 9 * 32
+    st1         { v10.8h }, [x4]
+
+    // v0,v1 = o_8[2]
+    adr         x4, .Lo8transform2
+    ld1         { v15.4h }, [x4]
+    smull       v0.4s, v18.4h, v15.h[0]
+    smlal       v0.4s, v22.4h, v15.h[1]
+    smlal       v0.4s, v26.4h, v15.h[2]
+    smlal       v0.4s, v30.4h, v15.h[3]
+    smull2      v1.4s, v18.8h, v15.h[0]
+    smlal2      v1.4s, v22.8h, v15.h[1]
+    smlal2      v1.4s, v26.8h, v15.h[2]
+    smlal2      v1.4s, v30.8h, v15.h[3]
+
+    // v2,v3 = e_16[2]
+    add         v2.4s, v6.4s, v0.4s
+    add         v3.4s, v7.4s, v1.4s
+
+    // v8,v9 = o_16[2]
+    adr         x4, .Lo16transform2
+    ld1         { v15.8h }, [x4]
+    adr         x5, 13f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+13: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // tmp[2 * 16]
+    add         v10.4s, v2.4s, v8.4s
+    add         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 2 * 32
+    st1         { v10.8h }, [x4]
+
+    // tmp[13 * 16]
+    sub         v10.4s, v2.4s, v8.4s
+    sub         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 13 * 32
+    st1         { v10.8h }, [x4]
+
+    // v2,v3 = e_16[5]
+    sub         v2.4s, v6.4s, v0.4s
+    sub         v3.4s, v7.4s, v1.4s
+
+    // v8,v9 = o_16[5]
+    adr         x4, .Lo16transform5
+    ld1         { v15.8h }, [x4]
+    adr         x5, 14f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+14: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // tmp[5 * 16]
+    add         v10.4s, v2.4s, v8.4s
+    add         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 5 * 32
+    st1         { v10.8h }, [x4]
+
+    // tmp[10 * 16]
+    sub         v10.4s, v2.4s, v8.4s
+    sub         v11.4s, v3.4s, v9.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 10 * 32
+    st1         { v10.8h }, [x4]
+
+    add         x2, x2, 16
+    add         x3, x3, 1
+    cmp         x3, 2
+    b.lo        1b
+
+
+    // horizontal transform
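+    // Each of the 16 rows of the intermediate result is transformed in place:
+    // .Leo_coeff supplies the even/odd coefficient tables, and the o_16 chain
+    // at 15f is entered at an offset derived from col_limit so that
+    // coefficients beyond the limit contribute no multiplies.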
+    adr         x4, .Leo_coeff
+    ld1         { v16.8h, v17.8h, v18.8h, v19.8h }, [x4], 64
+    ld1         { v20.8h, v21.8h, v22.8h, v23.8h }, [x4], 64
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], 64
+    // o_16 jump address
+    mov         x4, 64
+    bic         x5, x1, 1
+    subs        x4, x4, x5, lsl 2
+    csel        x4, x4, xzr, hs
+    adr         x5, 15f
+    add         x5, x5, x4
+
+    mov         x3, 16
+14: ld1         { v6.8h, v7.8h }, [x0]
+
+    // v2 = e_8
+    smull       v2.4s, v16.4h, v6.h[0]
+    smlal2      v2.4s, v16.8h, v6.h[4]
+    smlal       v2.4s, v17.4h, v7.h[0]
+    smlal2      v2.4s, v17.8h, v7.h[4]
+
+    // v3 = o_8
+    smull       v3.4s, v18.4h, v6.h[2]
+    smlal2      v3.4s, v18.8h, v6.h[6]
+    smlal       v3.4s, v19.4h, v7.h[2]
+    smlal2      v3.4s, v19.8h, v7.h[6]
+
+    // v0,v1 = e_16
+    add         v0.4s, v2.4s, v3.4s
+    sub         v2.4s, v2.4s, v3.4s
+    mov         v1.d[0], v2.d[1]
+    mov         v1.d[1], v2.d[0]
+    rev64       v1.4s, v1.4s
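+    // the two movs plus rev64 reverse the four differences so that v1 holds
+    // e_16[4..7] in natural element order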
+
+    // v2,v3 = o_16
+    movi        v2.4s, 0
+    movi        v3.4s, 0
+    br          x5
+15: smlal       v2.4s, v27.4h, v7.h[7]
+    smlal2      v3.4s, v27.8h, v7.h[7]
+    smlal       v2.4s, v26.4h, v7.h[5]
+    smlal2      v3.4s, v26.8h, v7.h[5]
+    smlal       v2.4s, v25.4h, v7.h[3]
+    smlal2      v3.4s, v25.8h, v7.h[3]
+    smlal       v2.4s, v24.4h, v7.h[1]
+    smlal2      v3.4s, v24.8h, v7.h[1]
+    smlal       v2.4s, v23.4h, v6.h[7]
+    smlal2      v3.4s, v23.8h, v6.h[7]
+    smlal       v2.4s, v22.4h, v6.h[5]
+    smlal2      v3.4s, v22.8h, v6.h[5]
+    smlal       v2.4s, v21.4h, v6.h[3]
+    smlal2      v3.4s, v21.8h, v6.h[3]
+    smlal       v2.4s, v20.4h, v6.h[1]
+    smlal2      v3.4s, v20.8h, v6.h[1]
+
+    // coeff
+    add         v4.4s, v0.4s, v2.4s
+    add         v5.4s, v1.4s, v3.4s
+    sub         v6.4s, v0.4s, v2.4s
+    sub         v7.4s, v1.4s, v3.4s
+    sqrshrn     v4.4h, v4.4s, 12
+    sqrshrn2    v4.8h, v5.4s, 12
+    sqrshrn     v6.4h, v6.4s, 12
+    sqrshrn2    v6.8h, v7.4s, 12
+    mov         v5.d[0], v6.d[1]
+    mov         v5.d[1], v6.d[0]
+    rev64       v5.8h, v5.8h
+    st1         { v4.8h, v5.8h }, [x0], 32
+    subs        x3, x3, 1
+    b.ne        14b
+
+    ld1         { v14.16b, v15.16b }, [sp], 32
+    ld1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp], 64
+    ret
+endfunc
+
+
+
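+// void ff_hevc_idct_32x32_8_neon(int16_t *coeffs, int col_limit)
+//
+// Same two-pass structure as the 16x16 transform, but with four slabs of 8
+// columns in the vertical pass.  The 16 o_32 partial-sum vectors of a slab,
+// which would not fit in registers, are computed up front and kept in a
+// scratch area on the stack.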
+function ff_hevc_idct_32x32_8_neon, export=1
+    sub         sp, sp, 64
+    st1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp]
+    sub         sp, sp, 64
+    st1         { v12.16b, v13.16b, v14.16b, v15.16b }, [sp]
+    sub         sp, sp, 16 * 32 * 4     // room for o_32: 16 * 32 values
+    mov         x3, 0                   // loop counter
+    mov         x2, x0
+    mov         x7, 83
+    add         x7, x7, 36 * 65536      // pack the o0/o1 coefficient factors 83 and 36 into x7
+1:  mov         x9, 128
+    // loading odd lines
+    add         x4, x2, 64              // odd lines
+    ld1         { v16.8h }, [x4], x9    // line 1
+    ld1         { v17.8h }, [x4], x9    // line 3
+    ld1         { v18.8h }, [x4], x9    // line 5
+    ld1         { v19.8h }, [x4], x9    // line 7
+    ld1         { v20.8h }, [x4], x9    // line 9
+    ld1         { v21.8h }, [x4], x9    // line 11
+    ld1         { v22.8h }, [x4], x9    // line 13
+    ld1         { v23.8h }, [x4], x9    // line 15
+    ld1         { v24.8h }, [x4], x9    // line 17
+    ld1         { v25.8h }, [x4], x9    // line 19
+    ld1         { v26.8h }, [x4], x9    // line 21
+    ld1         { v27.8h }, [x4], x9    // line 23
+    ld1         { v28.8h }, [x4], x9    // line 25
+    ld1         { v29.8h }, [x4], x9    // line 27
+    ld1         { v30.8h }, [x4], x9    // line 29
+    ld1         { v31.8h }, [x4], x9    // line 31
+
+    cmp         x1, 28
+    b.hs        5f
+    // limit2 below 32
+    bic         x4, x1, 1
+    adr         x5, .LimitMask
+    cbnz        x3, 3f
+    // columns 0 .. 7 - cleanup of indexes 5 .. 7
+    ld1         { v0.8h }, [x5]
+    adr         x5, 2f
+    add         x5, x5, x4, lsl 2
+    add         x5, x5, x4, lsl 1
+    br          x5
+2:  and         v16.16b, v16.16b, v0.16b    // col_limit 0..1 -> limit2 == 4..5
+    and         v17.16b, v17.16b, v0.16b
+    b           5f
+    and         v17.16b, v17.16b, v0.16b    // col_limit 2..3 -> limit2 == 6..7
+    and         v18.16b, v18.16b, v0.16b
+    b           5f
+    and         v18.16b, v18.16b, v0.16b    // col_limit 4..5 -> limit2 == 8..9
+    and         v19.16b, v19.16b, v0.16b
+    b           5f
+    and         v19.16b, v19.16b, v0.16b    // col_limit 6..7 -> limit2 == 10..11
+    and         v20.16b, v20.16b, v0.16b
+    b           5f
+    and         v20.16b, v20.16b, v0.16b    // col_limit 8..9 -> limit2 == 12..13
+    and         v21.16b, v21.16b, v0.16b
+    b           5f
+    and         v21.16b, v21.16b, v0.16b    // col_limit 10..11 -> limit2 == 14..15
+    and         v22.16b, v22.16b, v0.16b
+    b           5f
+    and         v22.16b, v22.16b, v0.16b    // col_limit 12..13 -> limit2 == 16..17
+    and         v23.16b, v23.16b, v0.16b
+    b           5f
+    and         v23.16b, v23.16b, v0.16b    // col_limit 14..15 -> limit2 == 18..19
+    and         v24.16b, v24.16b, v0.16b
+    b           5f
+    and         v24.16b, v24.16b, v0.16b    // col_limit 16..17 -> limit2 == 20..21
+    and         v25.16b, v25.16b, v0.16b
+    b           5f
+    and         v25.16b, v25.16b, v0.16b    // col_limit 18..19 -> limit2 == 22..23
+    and         v26.16b, v26.16b, v0.16b
+    b           5f
+    and         v26.16b, v26.16b, v0.16b    // col_limit 20..21 -> limit2 == 24..25
+    and         v27.16b, v27.16b, v0.16b
+    b           5f
+    and         v27.16b, v27.16b, v0.16b    // col_limit 22..23 -> limit2 == 26..27
+    and         v28.16b, v28.16b, v0.16b
+    b           5f
+    and         v28.16b, v28.16b, v0.16b    // col_limit 24..25 -> limit2 == 28..29
+    and         v29.16b, v29.16b, v0.16b
+    b           5f
+    and         v29.16b, v29.16b, v0.16b    // col_limit 26..27 -> limit2 == 30..31
+    and         v30.16b, v30.16b, v0.16b
+    b           5f
+    // columns 8 .. 31
+3:  add         x4, x4, 6
+    subs        x4, x4, x3, lsl 3
+    b.lo        5f
+    ld1         { v0.8h, v1.8h }, [x5]
+    adr         x5, 4f
+    add         x5, x5, x4, lsl 3
+    add         x5, x5, x4, lsl 1
+    br          x5
+4:  and         v16.16b, v16.16b, v1.16b    // limit2 == 2..3
+    b           5f
+    nop
+    nop
+    nop
+    and         v16.16b, v16.16b, v1.16b    // limit2 == 4..5
+    and         v17.16b, v17.16b, v1.16b
+    b           5f
+    nop
+    nop
+    and         v16.16b, v16.16b, v0.16b    // limit2 == 6..7
+    and         v17.16b, v17.16b, v1.16b
+    and         v18.16b, v18.16b, v1.16b
+    b           5f
+    nop
+    and         v16.16b, v16.16b, v0.16b    // limit2 == 8..9
+    and         v17.16b, v17.16b, v0.16b
+    and         v18.16b, v18.16b, v1.16b
+    and         v19.16b, v19.16b, v1.16b
+    b           5f
+    and         v17.16b, v17.16b, v0.16b    // limit2 == 10..11
+    and         v18.16b, v18.16b, v0.16b
+    and         v19.16b, v19.16b, v1.16b
+    and         v20.16b, v20.16b, v1.16b
+    b           5f
+    and         v18.16b, v18.16b, v0.16b    // limit2 == 12..13
+    and         v19.16b, v19.16b, v0.16b
+    and         v20.16b, v20.16b, v1.16b
+    and         v21.16b, v21.16b, v1.16b
+    b           5f
+    and         v19.16b, v19.16b, v0.16b    // limit2 == 14..15
+    and         v20.16b, v20.16b, v0.16b
+    and         v21.16b, v21.16b, v1.16b
+    and         v22.16b, v22.16b, v1.16b
+    b           5f
+    and         v20.16b, v20.16b, v0.16b    // limit2 == 16..17
+    and         v21.16b, v21.16b, v0.16b
+    and         v22.16b, v22.16b, v1.16b
+    and         v23.16b, v23.16b, v1.16b
+    b           5f
+    and         v21.16b, v21.16b, v0.16b    // limit2 == 18..19
+    and         v22.16b, v22.16b, v0.16b
+    and         v23.16b, v23.16b, v1.16b
+    and         v24.16b, v24.16b, v1.16b
+    b           5f
+    and         v22.16b, v22.16b, v0.16b    // limit2 == 20..21
+    and         v23.16b, v23.16b, v0.16b
+    and         v24.16b, v24.16b, v1.16b
+    and         v25.16b, v25.16b, v1.16b
+    b           5f
+    and         v23.16b, v23.16b, v0.16b    // limit2 == 22..23
+    and         v24.16b, v24.16b, v0.16b
+    and         v25.16b, v25.16b, v1.16b
+    and         v26.16b, v26.16b, v1.16b
+    b           5f
+    and         v24.16b, v24.16b, v0.16b    // limit2 == 24..25
+    and         v25.16b, v25.16b, v0.16b
+    and         v26.16b, v26.16b, v1.16b
+    and         v27.16b, v27.16b, v1.16b
+    b           5f
+    and         v25.16b, v25.16b, v0.16b    // limit2 == 26..27
+    and         v26.16b, v26.16b, v0.16b
+    and         v27.16b, v27.16b, v1.16b
+    and         v28.16b, v28.16b, v1.16b
+    b           5f
+
+
+    // o_32
+5:  mov         x5, 32
+    cmp         x1, 28
+    b.hs        6f
+    add         x5, x1, 4
+    bic         x5, x5, 1
+    cbz         x3, 6f
+    add         x5, x1, 6
+    orr         x5, x5, 1
+    subs        x5, x5, x3, lsl 3
+    csel        x5, x5, xzr, hs
+6:  mov         x4, 128
+    sub         x4, x4, x5, lsl 2
+    adr         x5, 8f
+    add         x5, x5, x4
+    adr         x4, .Lo32transform
+    mov         x8, sp
+    mov         x6, 16
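+    // Compute all 16 o_32 partial-sum vectors for this slab and store them at
+    // [sp]; they are reloaded below as the row butterflies are assembled.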
+7:  ld1         { v2.8h, v3.8h }, [x4], 32
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+8:  smlal2      v9.4s, v31.8h, v3.h[7]
+    smlal       v8.4s, v31.4h, v3.h[7]
+    smlal2      v9.4s, v30.8h, v3.h[6]
+    smlal       v8.4s, v30.4h, v3.h[6]
+    smlal2      v9.4s, v29.8h, v3.h[5]
+    smlal       v8.4s, v29.4h, v3.h[5]
+    smlal2      v9.4s, v28.8h, v3.h[4]
+    smlal       v8.4s, v28.4h, v3.h[4]
+    smlal2      v9.4s, v27.8h, v3.h[3]
+    smlal       v8.4s, v27.4h, v3.h[3]
+    smlal2      v9.4s, v26.8h, v3.h[2]
+    smlal       v8.4s, v26.4h, v3.h[2]
+    smlal2      v9.4s, v25.8h, v3.h[1]
+    smlal       v8.4s, v25.4h, v3.h[1]
+    smlal2      v9.4s, v24.8h, v3.h[0]
+    smlal       v8.4s, v24.4h, v3.h[0]
+    smlal2      v9.4s, v23.8h, v2.h[7]
+    smlal       v8.4s, v23.4h, v2.h[7]
+    smlal2      v9.4s, v22.8h, v2.h[6]
+    smlal       v8.4s, v22.4h, v2.h[6]
+    smlal2      v9.4s, v21.8h, v2.h[5]
+    smlal       v8.4s, v21.4h, v2.h[5]
+    smlal2      v9.4s, v20.8h, v2.h[4]
+    smlal       v8.4s, v20.4h, v2.h[4]
+    smlal2      v9.4s, v19.8h, v2.h[3]
+    smlal       v8.4s, v19.4h, v2.h[3]
+    smlal2      v9.4s, v18.8h, v2.h[2]
+    smlal       v8.4s, v18.4h, v2.h[2]
+    smlal2      v9.4s, v17.8h, v2.h[1]
+    smlal       v8.4s, v17.4h, v2.h[1]
+    smlal2      v9.4s, v16.8h, v2.h[0]
+    smlal       v8.4s, v16.4h, v2.h[0]
+    st1         { v8.4s, v9.4s }, [x8], 32
+    subs        x6, x6, 1
+    b.ne        7b
+
+    mov         x4, x2
+    ld1         { v16.8h }, [x4], x9    // line 0
+    ld1         { v17.8h }, [x4], x9    // line 2
+    ld1         { v18.8h }, [x4], x9    // line 4
+    ld1         { v19.8h }, [x4], x9    // line 6
+    ld1         { v20.8h }, [x4], x9    // line 8
+    ld1         { v21.8h }, [x4], x9    // line 10
+    ld1         { v22.8h }, [x4], x9    // line 12
+    ld1         { v23.8h }, [x4], x9    // line 14
+    ld1         { v24.8h }, [x4], x9    // line 16
+    ld1         { v25.8h }, [x4], x9    // line 18
+    ld1         { v26.8h }, [x4], x9    // line 20
+    ld1         { v27.8h }, [x4], x9    // line 22
+    ld1         { v28.8h }, [x4], x9    // line 24
+    ld1         { v29.8h }, [x4], x9    // line 26
+    ld1         { v30.8h }, [x4], x9    // line 28
+    ld1         { v31.8h }, [x4], x9    // line 30
+    cmp         x1, 28
+    b.hs        12f
+    // limit2 below 32
+    bic         x4, x1, 3
+    cbnz        x3, 10f
+    // columns 0 .. 7 - cleanup of indexes 5 .. 7
+    adr         x5, 9f
+    add         x5, x5, x4, lsl 1
+    br          x5
+9:  and         v17.16b, v17.16b, v0.16b    // col_limit 0..3 -> limit2/2 == 2..3
+    b           12f
+    and         v19.16b, v19.16b, v0.16b    // col_limit 4..7 -> limit2/2 == 4..5
+    b           12f
+    and         v21.16b, v21.16b, v0.16b    // col_limit 8..11 -> limit2/2 == 6..7
+    b           12f
+    and         v23.16b, v23.16b, v0.16b    // col_limit 12..15 -> limit2/2 == 8..9
+    b           12f
+    and         v25.16b, v25.16b, v0.16b    // col_limit 16..19 -> limit2/2 == 10..11
+    b           12f
+    and         v27.16b, v27.16b, v0.16b    // col_limit 20..23 -> limit2/2 == 12..13
+    b           12f
+    and         v29.16b, v29.16b, v0.16b    // col_limit 24..27 -> limit2/2 == 14..15
+    b           12f
+    // columns 8 .. 31
+10: add         x4, x4, 4
+    subs        x4, x4, x3, lsl 3           // x4 = (limit2 & ~3) - 4 for the slab starting at column 8 * x3
+    b.lo        12f
+    adr         x5, 11f
+    add         x5, x5, x4, lsl 1
+    add         x5, x5, x4
+    br          x5
+11: and         v17.16b, v17.16b, v1.16b    // limit2 == 4..7
+    b           12f
+    nop
+    and         v17.16b, v17.16b, v0.16b    // limit2 == 8..11
+    and         v19.16b, v19.16b, v1.16b
+    b           12f
+    and         v19.16b, v19.16b, v0.16b    // limit2 == 12..15
+    and         v21.16b, v21.16b, v1.16b
+    b           12f
+    and         v21.16b, v21.16b, v0.16b    // limit2 == 16..19
+    and         v23.16b, v23.16b, v1.16b
+    b           12f
+    and         v23.16b, v23.16b, v0.16b    // limit2 == 20..23
+    and         v25.16b, v25.16b, v1.16b
+    b           12f
+    and         v25.16b, v25.16b, v0.16b    // limit2 == 24..27
+    and         v27.16b, v27.16b, v1.16b
+    b           12f
+
+    // v0,v1 = e0
+12: sshll       v0.4s, v16.4h, 6
+    sshll       v1.4s, v24.4h, 6
+    add         v0.4s, v0.4s, v1.4s
+    sshll2      v1.4s, v16.8h, 6
+    sshll2      v2.4s, v24.8h, 6
+    add         v1.4s, v1.4s, v2.4s
+
+    // v2,v3 = o0
+    mov         v14.s[0], w7
+    smull       v2.4s, v20.4h, v14.h[0]
+    smlal       v2.4s, v28.4h, v14.h[1]
+    smull2      v3.4s, v20.8h, v14.h[0]
+    smlal2      v3.4s, v28.8h, v14.h[1]
+
+    // v4,v5 = e_8[0]
+    add         v4.4s, v0.4s, v2.4s
+    add         v5.4s, v1.4s, v3.4s
+
+    // v6,v7 = e_8[3]
+    sub         v6.4s, v0.4s, v2.4s
+    sub         v7.4s, v1.4s, v3.4s
+
+
+    // v0,v1 = o_8[0]
+    adr         x4, .Lo8transform0
+    ld1         { v15.4h }, [x4]
+    smull       v0.4s, v18.4h, v15.h[0]
+    smlal       v0.4s, v22.4h, v15.h[1]
+    smlal       v0.4s, v26.4h, v15.h[2]
+    smlal       v0.4s, v30.4h, v15.h[3]
+    smull2      v1.4s, v18.8h, v15.h[0]
+    smlal2      v1.4s, v22.8h, v15.h[1]
+    smlal2      v1.4s, v26.8h, v15.h[2]
+    smlal2      v1.4s, v30.8h, v15.h[3]
+
+    // v2,v3 = e_16[0]
+    add         v2.4s, v4.4s, v0.4s
+    add         v3.4s, v5.4s, v1.4s
+
+    // v8,v9 = o_16[0]
+    adr         x4, .Lo16transform0
+    ld1         { v15.8h }, [x4]
+    mov         x5, 32
+    cmp         x1, 28
+    b.hs        13f
+    add         x5, x1, 4
+    bic         x5, x5, 3
+    cbz         x3, 13f
+    orr         x5, x5, 2
+    subs        x5, x5, x3, lsl 3
+    csel        x5, x5, xzr, hs
+13: mov         x4, 64
+    sub         x6, x4, x5, lsl 1
+    adr         x5, 14f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+14: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // v12,v13 = e_32[0]
+    add         v12.4s, v2.4s, v8.4s
+    add         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[0]
+    ld1         { v14.4s, v15.4s }, [sp]
+
+    // tmp[0 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    st1         { v10.8h }, [x2]
+
+    // tmp[31 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 31 * 64
+    st1         { v10.8h }, [x4]
+
+    // v12,v13 = e_32[15]
+    sub         v12.4s, v2.4s, v8.4s
+    sub         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[15]
+    add         x4, sp, 15 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[15 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 15 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[16 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 16 * 64
+    st1         { v10.8h }, [x4]
+
+    // v2,v3 = e_16[7]
+    sub         v2.4s, v4.4s, v0.4s
+    sub         v3.4s, v5.4s, v1.4s
+
+    // v8,v9 = o_16[7]
+    adr         x4, .Lo16transform7
+    ld1         { v15.8h }, [x4]
+    adr         x5, 15f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+15: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // v12,v13 = e_32[7]
+    add         v12.4s, v2.4s, v8.4s
+    add         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[7]
+    add         x4, sp, 7 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[7 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 7 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[24 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 24 * 64
+    st1         { v10.8h }, [x4]
+
+    // v12,v13 = e_32[8]
+    sub         v12.4s, v2.4s, v8.4s
+    sub         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[8]
+    add         x4, sp, 8 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[8 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 8 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[23 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 23 * 64
+    st1         { v10.8h }, [x4]
+
+    // v0,v1 = o_8[3]
+    adr         x4, .Lo8transform3
+    ld1         { v15.4h }, [x4]
+    smull       v0.4s, v18.4h, v15.h[0]
+    smlal       v0.4s, v22.4h, v15.h[1]
+    smlal       v0.4s, v26.4h, v15.h[2]
+    smlal       v0.4s, v30.4h, v15.h[3]
+    smull2      v1.4s, v18.8h, v15.h[0]
+    smlal2      v1.4s, v22.8h, v15.h[1]
+    smlal2      v1.4s, v26.8h, v15.h[2]
+    smlal2      v1.4s, v30.8h, v15.h[3]
+
+    // v2,v3 = e_16[3]
+    add         v2.4s, v6.4s, v0.4s
+    add         v3.4s, v7.4s, v1.4s
+
+    // v8,v9 = o_16[3]
+    adr         x4, .Lo16transform3
+    ld1         { v15.8h }, [x4]
+    adr         x5, 16f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+16: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // v12,v13 = e_32[3]
+    add         v12.4s, v2.4s, v8.4s
+    add         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[3]
+    add         x4, sp, 3 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[3 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 3 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[28 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 28 * 64
+    st1         { v10.8h }, [x4]
+
+    // v12,v13 = e_32[12]
+    sub         v12.4s, v2.4s, v8.4s
+    sub         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[12]
+    add         x4, sp, 12 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[12 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 12 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[19 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 19 * 64
+    st1         { v10.8h }, [x4]
+
+    // v2,v3 = e_16[4]
+    sub         v2.4s, v6.4s, v0.4s
+    sub         v3.4s, v7.4s, v1.4s
+
+    // v8,v9 = o_16[4]
+    adr         x4, .Lo16transform4
+    ld1         { v15.8h }, [x4]
+    adr         x5, 17f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+17: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // v12,v13 = e_32[4]
+    add         v12.4s, v2.4s, v8.4s
+    add         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[4]
+    add         x4, sp, 4 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[4 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 4 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[27 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 27 * 64
+    st1         { v10.8h }, [x4]
+
+    // v12,v13 = e_32[11]
+    sub         v12.4s, v2.4s, v8.4s
+    sub         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[11]
+    add         x4, sp, 11 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[11 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 11 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[20 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 20 * 64
+    st1         { v10.8h }, [x4]
+
+    // v0,v1 = e1
+    sshll       v0.4s, v16.4h, 6
+    sshll       v1.4s, v24.4h, 6
+    sub         v0.4s, v0.4s, v1.4s
+    sshll2      v1.4s, v16.8h, 6
+    sshll2      v2.4s, v24.8h, 6
+    sub         v1.4s, v1.4s, v2.4s
+
+    // v2,v3 = o1
+    mov         v14.s[0], w7
+    smull       v2.4s, v20.4h, v14.h[1]
+    smlsl       v2.4s, v28.4h, v14.h[0]
+    smull2      v3.4s, v20.8h, v14.h[1]
+    smlsl2      v3.4s, v28.8h, v14.h[0]
+
+    // v4,v5 = e_8[1]
+    add         v4.4s, v0.4s, v2.4s
+    add         v5.4s, v1.4s, v3.4s
+
+    // v6,v7 = e_8[2]
+    sub         v6.4s, v0.4s, v2.4s
+    sub         v7.4s, v1.4s, v3.4s
+
+    // v0,v1 = o_8[1]
+    adr         x4, .Lo8transform1
+    ld1         { v15.4h }, [x4]
+    smull       v0.4s, v18.4h, v15.h[0]
+    smlal       v0.4s, v22.4h, v15.h[1]
+    smlal       v0.4s, v26.4h, v15.h[2]
+    smlal       v0.4s, v30.4h, v15.h[3]
+    smull2      v1.4s, v18.8h, v15.h[0]
+    smlal2      v1.4s, v22.8h, v15.h[1]
+    smlal2      v1.4s, v26.8h, v15.h[2]
+    smlal2      v1.4s, v30.8h, v15.h[3]
+
+    // v2,v3 = e_16[1]
+    add         v2.4s, v4.4s, v0.4s
+    add         v3.4s, v5.4s, v1.4s
+
+    // v8,v9 = o_16[1]
+    adr         x4, .Lo16transform1
+    ld1         { v15.8h }, [x4]
+    adr         x5, 18f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+18: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // v12,v13 = e_32[1]
+    add         v12.4s, v2.4s, v8.4s
+    add         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[1]
+    add         x4, sp, 1 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[1 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 1 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[30 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 30 * 64
+    st1         { v10.8h }, [x4]
+
+    // v12,v13 = e_32[14]
+    sub         v12.4s, v2.4s, v8.4s
+    sub         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[14]
+    add         x4, sp, 14 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[14 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 14 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[17 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 17 * 64
+    st1         { v10.8h }, [x4]
+
+    // v2,v3 = e_16[6]
+    sub         v2.4s, v4.4s, v0.4s
+    sub         v3.4s, v5.4s, v1.4s
+
+    // v8,v9 = o_16[6]
+    adr         x4, .Lo16transform6
+    ld1         { v15.8h }, [x4]
+    adr         x5, 19f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+19: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // v12,v13 = e_32[6]
+    add         v12.4s, v2.4s, v8.4s
+    add         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[6]
+    add         x4, sp, 6 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[6 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 6 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[25 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 25 * 64
+    st1         { v10.8h }, [x4]
+
+    // v12,v13 = e_32[9]
+    sub         v12.4s, v2.4s, v8.4s
+    sub         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[9]
+    add         x4, sp, 9 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[9 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 9 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[22 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 22 * 64
+    st1         { v10.8h }, [x4]
+
+    // v0,v1 = o_8[2]
+    adr         x4, .Lo8transform2
+    ld1         { v15.4h }, [x4]
+    smull       v0.4s, v18.4h, v15.h[0]
+    smlal       v0.4s, v22.4h, v15.h[1]
+    smlal       v0.4s, v26.4h, v15.h[2]
+    smlal       v0.4s, v30.4h, v15.h[3]
+    smull2      v1.4s, v18.8h, v15.h[0]
+    smlal2      v1.4s, v22.8h, v15.h[1]
+    smlal2      v1.4s, v26.8h, v15.h[2]
+    smlal2      v1.4s, v30.8h, v15.h[3]
+
+    // v2,v3 = e_16[2]
+    add         v2.4s, v6.4s, v0.4s
+    add         v3.4s, v7.4s, v1.4s
+
+    // v8,v9 = o_16[2]
+    adr         x4, .Lo16transform2
+    ld1         { v15.8h }, [x4]
+    adr         x5, 20f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+20: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // v12,v13 = e_32[2]
+    add         v12.4s, v2.4s, v8.4s
+    add         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[2]
+    add         x4, sp, 2 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[2 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 2 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[29 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 29 * 64
+    st1         { v10.8h }, [x4]
+
+    // v12,v13 = e_32[13]
+    sub         v12.4s, v2.4s, v8.4s
+    sub         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[13]
+    add         x4, sp, 13 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[13 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 13 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[18 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 18 * 64
+    st1         { v10.8h }, [x4]
+
+    // v2,v3 = e_16[5]
+    sub         v2.4s, v6.4s, v0.4s
+    sub         v3.4s, v7.4s, v1.4s
+
+    // v8,v9 = o_16[5]
+    adr         x4, .Lo16transform5
+    ld1         { v15.8h }, [x4]
+    adr         x5, 21f
+    add         x5, x5, x6
+    movi        v8.4s, 0
+    movi        v9.4s, 0
+    br          x5
+21: smlal2      v9.4s, v31.8h, v15.h[7]
+    smlal       v8.4s, v31.4h, v15.h[7]
+    smlal2      v9.4s, v29.8h, v15.h[6]
+    smlal       v8.4s, v29.4h, v15.h[6]
+    smlal2      v9.4s, v27.8h, v15.h[5]
+    smlal       v8.4s, v27.4h, v15.h[5]
+    smlal2      v9.4s, v25.8h, v15.h[4]
+    smlal       v8.4s, v25.4h, v15.h[4]
+    smlal2      v9.4s, v23.8h, v15.h[3]
+    smlal       v8.4s, v23.4h, v15.h[3]
+    smlal2      v9.4s, v21.8h, v15.h[2]
+    smlal       v8.4s, v21.4h, v15.h[2]
+    smlal2      v9.4s, v19.8h, v15.h[1]
+    smlal       v8.4s, v19.4h, v15.h[1]
+    smlal2      v9.4s, v17.8h, v15.h[0]
+    smlal       v8.4s, v17.4h, v15.h[0]
+
+    // v12,v13 = e_32[5]
+    add         v12.4s, v2.4s, v8.4s
+    add         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[5]
+    add         x4, sp, 5 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[5 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 5 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[26 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 26 * 64
+    st1         { v10.8h }, [x4]
+
+    // v12,v13 = e_32[10]
+    sub         v12.4s, v2.4s, v8.4s
+    sub         v13.4s, v3.4s, v9.4s
+
+    // v14,v15 = o_32[10]
+    add         x4, sp, 10 * 32
+    ld1         { v14.4s, v15.4s }, [x4]
+
+    // tmp[10 * 32]
+    add         v10.4s, v12.4s, v14.4s
+    add         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 10 * 64
+    st1         { v10.8h }, [x4]
+
+    // tmp[21 * 32]
+    sub         v10.4s, v12.4s, v14.4s
+    sub         v11.4s, v13.4s, v15.4s
+    sqrshrn     v10.4h, v10.4s, 7
+    sqrshrn2    v10.8h, v11.4s, 7
+    add         x4, x2, 21 * 64
+    st1         { v10.8h }, [x4]
+
+
+    add         x2, x2, 16
+    add         x3, x3, 1
+    cmp         x3, 4
+    b.ne        1b
+
+
+#if 0
+    b           99f
+.globl ff_hevc_idct_32x32_8_neonB
+ff_hevc_idct_32x32_8_neonB:
+    sub         sp, sp, 64
+    st1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp]
+    sub         sp, sp, 64
+    st1         { v12.16b, v13.16b, v14.16b, v15.16b }, [sp]
+    sub         sp, sp, 16 * 32 * 4     // room for o_32: 16 * 32 values
+#endif
+
+
+
+    // horizontal transform
+    cmp         x1, 9
+    b.ls        24f
+    // o_32 computed partially: the terms for the 12 highest odd coefficient
+    // positions (9, 11, .., 31) of every row are accumulated into the stack
+    // buffer here; positions 1, 3, 5, 7 are added per row in the loop at 29f.
+    adr         x4, .Lo32transform9_31
+    ld1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x4], 64
+    ld1         { v12.8h, v13.8h, v14.8h, v15.8h }, [x4], 64
+    ld1         { v16.8h, v17.8h, v18.8h, v19.8h }, [x4], 64
+    ld1         { v20.8h, v21.8h, v22.8h, v23.8h }, [x4], 64
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], 64
+    ld1         { v28.8h, v29.8h, v30.8h, v31.8h }, [x4], 64
+    bic         x5, x1, 1
+    subs        x5, x5, 8
+    csel        x5, x5, xzr, hs
+    mov         x4, 24
+    subs        x4, x4, x5
+    csel        x5, x4, xzr, hs
+    adr         x4, 23f
+    add         x5, x4, x5, lsl 3
+    add         x2, x0, 16
+    mov         x8, sp
+    mov         x3, 64
+    mov         x6, 32
+22: ld1         { v0.8h, v1.8h, v2.8h }, [x2], x3
+    movi        v4.4s, 0
+    movi        v5.4s, 0
+    movi        v6.4s, 0
+    movi        v7.4s, 0
+    br          x5
+23: smlal       v4.4s, v30.4h, v2.h[7]
+    smlal2      v5.4s, v30.8h, v2.h[7]
+    smlal       v6.4s, v31.4h, v2.h[7]
+    smlal2      v7.4s, v31.8h, v2.h[7]
+    smlal       v4.4s, v28.4h, v2.h[5]
+    smlal2      v5.4s, v28.8h, v2.h[5]
+    smlal       v6.4s, v29.4h, v2.h[5]
+    smlal2      v7.4s, v29.8h, v2.h[5]
+    smlal       v4.4s, v26.4h, v2.h[3]
+    smlal2      v5.4s, v26.8h, v2.h[3]
+    smlal       v6.4s, v27.4h, v2.h[3]
+    smlal2      v7.4s, v27.8h, v2.h[3]
+    smlal       v4.4s, v24.4h, v2.h[1]
+    smlal2      v5.4s, v24.8h, v2.h[1]
+    smlal       v6.4s, v25.4h, v2.h[1]
+    smlal2      v7.4s, v25.8h, v2.h[1]
+    smlal       v4.4s, v22.4h, v1.h[7]
+    smlal2      v5.4s, v22.8h, v1.h[7]
+    smlal       v6.4s, v23.4h, v1.h[7]
+    smlal2      v7.4s, v23.8h, v1.h[7]
+    smlal       v4.4s, v20.4h, v1.h[5]
+    smlal2      v5.4s, v20.8h, v1.h[5]
+    smlal       v6.4s, v21.4h, v1.h[5]
+    smlal2      v7.4s, v21.8h, v1.h[5]
+    smlal       v4.4s, v18.4h, v1.h[3]
+    smlal2      v5.4s, v18.8h, v1.h[3]
+    smlal       v6.4s, v19.4h, v1.h[3]
+    smlal2      v7.4s, v19.8h, v1.h[3]
+    smlal       v4.4s, v16.4h, v1.h[1]
+    smlal2      v5.4s, v16.8h, v1.h[1]
+    smlal       v6.4s, v17.4h, v1.h[1]
+    smlal2      v7.4s, v17.8h, v1.h[1]
+    smlal       v4.4s, v14.4h, v0.h[7]
+    smlal2      v5.4s, v14.8h, v0.h[7]
+    smlal       v6.4s, v15.4h, v0.h[7]
+    smlal2      v7.4s, v15.8h, v0.h[7]
+    smlal       v4.4s, v12.4h, v0.h[5]
+    smlal2      v5.4s, v12.8h, v0.h[5]
+    smlal       v6.4s, v13.4h, v0.h[5]
+    smlal2      v7.4s, v13.8h, v0.h[5]
+    smlal       v4.4s, v10.4h, v0.h[3]
+    smlal2      v5.4s, v10.8h, v0.h[3]
+    smlal       v6.4s, v11.4h, v0.h[3]
+    smlal2      v7.4s, v11.8h, v0.h[3]
+    smlal       v4.4s, v8.4h, v0.h[1]
+    smlal2      v5.4s, v8.8h, v0.h[1]
+    smlal       v6.4s, v9.4h, v0.h[1]
+    smlal2      v7.4s, v9.8h, v0.h[1]
+    st1         { v4.4s, v5.4s, v6.4s, v7.4s }, [x8], 64
+    subs        x6, x6, 1
+    b.ne        22b
+
+
+24: adr         x4, .Leo_coeff
+    ld1         { v12.8h, v13.8h, v14.8h, v15.8h }, [x4], 64
+    ld1         { v16.8h, v17.8h, v18.8h, v19.8h }, [x4], 64
+    ld1         { v20.8h, v21.8h, v22.8h, v23.8h }, [x4], 64
+    adr         x4, .Lo32transform
+    ld1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], 64
+    ld1         { v28.8h, v29.8h, v30.8h, v31.8h }, [x4], 64
+    // o_16 jump address
+    mov         x4, 64
+    bic         x5, x1, 3
+    subs        x4, x4, x5, lsl 1
+    csel        x4, x4, xzr, hs
+    adr         x5, 26f
+    add         x5, x5, x4
+    // o_32 jump address
+    bic         x6, x1, 1
+    mov         x4, 8
+    subs        x4, x4, x6
+    csel        x6, x4, xzr, hs
+    adr         x4, 29f
+    add         x6, x4, x6, lsl 3
+
+    mov         x8, sp
+    mov         x3, 32
+25: ld1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0]
+
+    // v2 = e_8
+    smull       v2.4s, v12.4h, v8.h[0]
+    smlal2      v2.4s, v12.8h, v9.h[0]
+    smlal       v2.4s, v13.4h, v10.h[0]
+    smlal2      v2.4s, v13.8h, v11.h[0]
+
+    // v3 = o_8
+    smull       v3.4s, v14.4h, v8.h[4]
+    smlal2      v3.4s, v14.8h, v9.h[4]
+    smlal       v3.4s, v15.4h, v10.h[4]
+    smlal2      v3.4s, v15.8h, v11.h[4]
+
+    // v0,v1 = e_16
+    add         v0.4s, v2.4s, v3.4s
+    sub         v2.4s, v2.4s, v3.4s
+    mov         v1.d[0], v2.d[1]
+    mov         v1.d[1], v2.d[0]
+    rev64       v1.4s, v1.4s
+
+    // v2,v3 = o_16
+    movi        v2.4s, 0
+    movi        v3.4s, 0
+    br          x5
+26: smlal       v2.4s, v23.4h, v11.h[6]
+    smlal2      v3.4s, v23.8h, v11.h[6]
+    smlal       v2.4s, v22.4h, v11.h[2]
+    smlal2      v3.4s, v22.8h, v11.h[2]
+    smlal       v2.4s, v21.4h, v10.h[6]
+    smlal2      v3.4s, v21.8h, v10.h[6]
+    smlal       v2.4s, v20.4h, v10.h[2]
+    smlal2      v3.4s, v20.8h, v10.h[2]
+    smlal       v2.4s, v19.4h, v9.h[6]
+    smlal2      v3.4s, v19.8h, v9.h[6]
+    smlal       v2.4s, v18.4h, v9.h[2]
+    smlal2      v3.4s, v18.8h, v9.h[2]
+    smlal       v2.4s, v17.4h, v8.h[6]
+    smlal2      v3.4s, v17.8h, v8.h[6]
+    smlal       v2.4s, v16.4h, v8.h[2]
+    smlal2      v3.4s, v16.8h, v8.h[2]
+
+    // v4,v5,v6,v7 = e_32
+    add         v4.4s, v0.4s, v2.4s
+    add         v5.4s, v1.4s, v3.4s
+    sub         v11.4s, v0.4s, v2.4s
+    mov         v7.d[0], v11.d[1]
+    mov         v7.d[1], v11.d[0]
+    rev64       v7.4s, v7.4s
+    sub         v11.4s, v1.4s, v3.4s
+    mov         v6.d[0], v11.d[1]
+    mov         v6.d[1], v11.d[0]
+    rev64       v6.4s, v6.4s
+
+    // v0,v1,v2,v3 = o_32
+    cmp         x1, 9
+    b.hi        28f
+    movi        v0.4s, 0
+    movi        v1.4s, 0
+    movi        v2.4s, 0
+    movi        v3.4s, 0
+    br          x6
+28: ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x8], 64
+    br          x6
+29: smlal       v0.4s, v30.4h, v8.h[7]
+    smlal2      v1.4s, v30.8h, v8.h[7]
+    smlal       v2.4s, v31.4h, v8.h[7]
+    smlal2      v3.4s, v31.8h, v8.h[7]
+    smlal       v0.4s, v28.4h, v8.h[5]
+    smlal2      v1.4s, v28.8h, v8.h[5]
+    smlal       v2.4s, v29.4h, v8.h[5]
+    smlal2      v3.4s, v29.8h, v8.h[5]
+    smlal       v0.4s, v26.4h, v8.h[3]
+    smlal2      v1.4s, v26.8h, v8.h[3]
+    smlal       v2.4s, v27.4h, v8.h[3]
+    smlal2      v3.4s, v27.8h, v8.h[3]
+    smlal       v0.4s, v24.4h, v8.h[1]
+    smlal2      v1.4s, v24.8h, v8.h[1]
+    smlal       v2.4s, v25.4h, v8.h[1]
+    smlal2      v3.4s, v25.8h, v8.h[1]
+
+    // coeff
+    add         v8.4s, v4.4s, v0.4s
+    add         v9.4s, v5.4s, v1.4s
+    add         v10.4s, v6.4s, v2.4s
+    add         v11.4s, v7.4s, v3.4s
+    sub         v4.4s, v4.4s, v0.4s
+    sub         v5.4s, v5.4s, v1.4s
+    sub         v6.4s, v6.4s, v2.4s
+    sub         v7.4s, v7.4s, v3.4s
+    sqrshrn     v8.4h, v8.4s, 12
+    sqrshrn2    v8.8h, v9.4s, 12
+    sqrshrn     v9.4h, v10.4s, 12
+    sqrshrn2    v9.8h, v11.4s, 12
+    sqrshrn     v4.4h, v4.4s, 12
+    sqrshrn2    v4.8h, v5.4s, 12
+    sqrshrn     v5.4h, v6.4s, 12
+    sqrshrn2    v5.8h, v7.4s, 12
+    mov         v10.d[0], v5.d[1]
+    mov         v10.d[1], v5.d[0]
+    rev64       v10.8h, v10.8h
+    mov         v11.d[0], v4.d[1]
+    mov         v11.d[1], v4.d[0]
+    rev64       v11.8h, v11.8h
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], 64
+    subs        x3, x3, 1
+    b.ne        25b
+
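+    // free the o_32 scratch area and restore the callee-saved registers v8-v15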
+    add         sp, sp, 16 * 32 * 4
+    ld1         { v12.16b, v13.16b, v14.16b, v15.16b }, [sp], 64
+    ld1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp], 64
+    ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
new file mode 100644
index 0000000000..6b290bb87d
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -0,0 +1,170 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/hevcdsp.h"
+#include "libavcodec/hevcdec.h"
+#include "libavcodec/get_bits.h"
+#include "libavcodec/hevc.h"
+#include "libavutil/aarch64/cpu.h"
+
+
+#define NEON8_FNPROTO(fn, args) \
+    void ff_hevc_put_hevc_##fn##4_8_neon args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon args;
+
+NEON8_FNPROTO(pel_pixels, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(pel_bi_pixels, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_h, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_hv, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_uni_hv, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_h, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_hv, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+void ff_hevc_sao_edge_filter_8_neon(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride_dst, int16_t *sao_offset_val,
+        int eo, int width, int height);
+
+void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_8_neon(int16_t *coeffs, int col_limit);
+
+
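+/* Assign one NEON function per block-width index: indices 1..9 correspond
+ * to widths 4, 6, 8, 12, 16, 24, 32, 48 and 64 (cf. ff_hevc_pel_weight[]). */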
+#define NEON8_FNASSIGN(member, v, h, fn) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon;
+
+
+av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags) && bit_depth == 8) {
+        NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels);
+        NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 0, pel_bi_pixels);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 1, epel_bi_h);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 0, epel_bi_v);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 1, epel_bi_hv);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 0, pel_bi_pixels);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 1, qpel_bi_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv);
+
+        c->sao_edge_filter[0] =
+        c->sao_edge_filter[1] =
+        c->sao_edge_filter[2] =
+        c->sao_edge_filter[3] =
+        c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_neon;
+
+        c->idct[2]            = ff_hevc_idct_16x16_8_neon;
+        c->idct[3]            = ff_hevc_idct_32x32_8_neon;
+    }
+}
diff --git a/libavcodec/aarch64/hevcdsp_qpel_8.S b/libavcodec/aarch64/hevcdsp_qpel_8.S
new file mode 100644
index 0000000000..4a3d96f9c1
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_8.S
@@ -0,0 +1,5666 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+
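+// HEVC 8-tap luma interpolation filter coefficients,
+// one row per quarter-sample fractional position (0..3)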
+.Lqpel_filters:
+    .byte  0,  0,  0,  0,  0,  0,  0,  0
+    .byte -1,  4,-10, 58, 17, -5,  1,  0
+    .byte -1,  4,-11, 40, 40,-11,  4, -1
+    .byte  0,  1, -5, 17, 58,-10,  4, -1
+
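+// Load the filter row selected by \freg, splatting each byte coefficient
+// across a vector. Taps 0, 2, 5 and 7 (which are <= 0 in every row) are
+// negated so the unsigned umlal/umlsl accumulation in calc_qpelb can be
+// used directly on 8-bit pixels.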
+.macro load_qpel_filterb freg, xreg
+    adr         \xreg, .Lqpel_filters
+    add         \xreg, \xreg, \freg, lsl 3
+    ld4r        { v0.16b, v1.16b, v2.16b, v3.16b }, [\xreg], 4
+    ld4r        { v4.16b, v5.16b, v6.16b, v7.16b }, [\xreg]
+    neg         v0.16b, v0.16b
+    neg         v2.16b, v2.16b
+    neg         v5.16b, v5.16b
+    neg         v7.16b, v7.16b
+.endm
+
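+// One 8-tap filter step on eight 8-bit samples, accumulating the widened
+// products into the 16-bit lanes of \dst; umlsl subtracts the terms whose
+// coefficients were negated at load time.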
+.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
+    umlsl       \dst\().8h, \src0\().8b, v0.8b
+    umlal       \dst\().8h, \src1\().8b, v1.8b
+    umlsl       \dst\().8h, \src2\().8b, v2.8b
+    umlal       \dst\().8h, \src3\().8b, v3.8b
+    umlal       \dst\().8h, \src4\().8b, v4.8b
+    umlsl       \dst\().8h, \src5\().8b, v5.8b
+    umlal       \dst\().8h, \src6\().8b, v6.8b
+    umlsl       \dst\().8h, \src7\().8b, v7.8b
+.endm
+
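+// As calc_qpelb, but operating on the high 8 bytes of each source vector.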
+.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+    umlsl2      \dst\().8h, \src0\().16b, v0.16b
+    umlal2      \dst\().8h, \src1\().16b, v1.16b
+    umlsl2      \dst\().8h, \src2\().16b, v2.16b
+    umlal2      \dst\().8h, \src3\().16b, v3.16b
+    umlal2      \dst\().8h, \src4\().16b, v4.16b
+    umlsl2      \dst\().8h, \src5\().16b, v5.16b
+    umlal2      \dst\().8h, \src6\().16b, v6.16b
+    umlsl2      \dst\().8h, \src7\().16b, v7.16b
+.endm
+
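+// Load the filter row selected by \freg as signed 16-bit coefficients,
+// for filtering 16-bit intermediate samples.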
+.macro load_qpel_filterh freg, xreg
+    adr         \xreg, .Lqpel_filters
+    add         \xreg, \xreg, \freg, lsl 3
+    ld1         { v0.8b }, [\xreg]
+    sxtl        v0.8h, v0.8b
+.endm
+
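+// 8-tap filter on signed 16-bit samples: the products are accumulated in
+// 32 bits, then shifted/narrowed by \shift (6 by default) using \op.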
+.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+    smull       \dst\().4s, \src0\().4h, v0.h[0]
+    smlal       \dst\().4s, \src1\().4h, v0.h[1]
+    smlal       \dst\().4s, \src2\().4h, v0.h[2]
+    smlal       \dst\().4s, \src3\().4h, v0.h[3]
+    smlal       \dst\().4s, \src4\().4h, v0.h[4]
+    smlal       \dst\().4s, \src5\().4h, v0.h[5]
+    smlal       \dst\().4s, \src6\().4h, v0.h[6]
+    smlal       \dst\().4s, \src7\().4h, v0.h[7]
+.ifeqs "\op", "sshr"
+    sshr        \dst\().4s, \dst\().4s, \shift
+.else
+    \op         \dst\().4h, \dst\().4s, \shift
+.endif
+.endm
+
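+// As calc_qpelh, but for the high four 16-bit lanes; \dstt holds the 32-bit sums.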
+.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+    smull2      \dstt\().4s, \src0\().8h, v0.h[0]
+    smlal2      \dstt\().4s, \src1\().8h, v0.h[1]
+    smlal2      \dstt\().4s, \src2\().8h, v0.h[2]
+    smlal2      \dstt\().4s, \src3\().8h, v0.h[3]
+    smlal2      \dstt\().4s, \src4\().8h, v0.h[4]
+    smlal2      \dstt\().4s, \src5\().8h, v0.h[5]
+    smlal2      \dstt\().4s, \src6\().8h, v0.h[6]
+    smlal2      \dstt\().4s, \src7\().8h, v0.h[7]
+.ifeqs "\op", "sshr"
+    sshr        \dst\().4s, \dstt\().4s, \shift
+.else
+    \op         \dst\().8h, \dstt\().4s, \shift
+.endif
+.endm
+
+
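+// put_hevc_qpel_h*: horizontal 8-tap filter from 8-bit source to the
+// 16-bit intermediate buffer. dst has a fixed stride of MAX_PB_SIZE (64)
+// int16_t elements, i.e. 128 bytes; x14 holds the post-increment that
+// completes that stride. The shifted source windows for the eight taps
+// are built with ushr byte shifts plus inserts of the following bytes.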
+function ff_hevc_put_hevc_qpel_h4_8_neon, export=1
+    load_qpel_filterb x4, x5
+    sub         x1, x1, 3
+    sub         x2, x2, 8
+    mov         x14, 128
+1:  ld1         { v16.8b }, [x1], 8
+    ld1         { v17.s }[0], [x1], x2
+    ushr        v18.2d, v16.2d, 8
+    mov         v18.b[7], v17.b[0]
+    ushr        v19.2d, v18.2d, 8
+    mov         v19.b[7], v17.b[1]
+    ushr        v20.2d, v19.2d, 8
+    mov         v20.b[7], v17.b[2]
+    ushr        v21.2d, v20.2d, 8
+    ushr        v22.2d, v21.2d, 8
+    ushr        v23.2d, v22.2d, 8
+    ushr        v24.2d, v23.2d, 8
+    movi        v28.8h, 0
+    calc_qpelb  v28, v16, v18, v19, v20, v21, v22, v23, v24
+    st1         { v28.4h }, [x0], x14
+    subs        x3, x3, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h6_8_neon, export=1
+    load_qpel_filterb x4, x5
+    sub         x1, x1, 3
+    mov         x14, 120
+1:  ld1         { v16.8b, v17.8b }, [x1], x2
+    ushr        v18.2d, v16.2d, 8
+    mov         v18.b[7], v17.b[0]
+    ushr        v19.2d, v18.2d, 8
+    mov         v19.b[7], v17.b[1]
+    ushr        v20.2d, v19.2d, 8
+    mov         v20.b[7], v17.b[2]
+    ushr        v21.2d, v20.2d, 8
+    mov         v21.b[7], v17.b[3]
+    ushr        v22.2d, v21.2d, 8
+    mov         v22.b[7], v17.b[4]
+    ushr        v23.2d, v22.2d, 8
+    ushr        v24.2d, v23.2d, 8
+    movi        v28.8h, 0
+    calc_qpelb  v28, v16, v18, v19, v20, v21, v22, v23, v24
+    st1         { v28.4h }, [x0], 8
+    st1         { v28.s }[2], [x0], x14
+    subs        x3, x3, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h8_8_neon, export=1
+    load_qpel_filterb x4, x5
+    sub         x1, x1, 3
+    mov         x14, 128
+1:  ld1         { v16.8b, v17.8b }, [x1], x2
+    ushr        v18.2d, v16.2d, 8
+    mov         v18.b[7], v17.b[0]
+    ushr        v19.2d, v18.2d, 8
+    mov         v19.b[7], v17.b[1]
+    ushr        v20.2d, v19.2d, 8
+    mov         v20.b[7], v17.b[2]
+    ushr        v21.2d, v20.2d, 8
+    mov         v21.b[7], v17.b[3]
+    ushr        v22.2d, v21.2d, 8
+    mov         v22.b[7], v17.b[4]
+    ushr        v23.2d, v22.2d, 8
+    mov         v23.b[7], v17.b[5]
+    ushr        v24.2d, v23.2d, 8
+    mov         v24.b[7], v17.b[6]
+    movi        v28.8h, 0
+    calc_qpelb  v28, v16, v18, v19, v20, v21, v22, v23, v24
+    st1         { v28.8h }, [x0], x14
+    subs        x3, x3, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h12_8_neon, export=1
+    load_qpel_filterb x4, x5
+    sub         x1, x1, 3
+    sub         x2, x2, 16
+    mov         x14, 112
+1:  ld2         { v16.8b, v17.8b }, [x1], 16
+    ld1         { v27.s }[0], [x1], x2
+    ushr        v18.2d, v16.2d, 8
+    ushr        v19.2d, v17.2d, 8
+    mov         v18.b[7], v27.b[0]
+    mov         v19.b[7], v27.b[1]
+    ushr        v20.2d, v18.2d, 8
+    ushr        v21.2d, v19.2d, 8
+    mov         v20.b[7], v27.b[2]
+    mov         v21.b[7], v27.b[3]
+    ushr        v22.2d, v20.2d, 8
+    ushr        v23.2d, v21.2d, 8
+    ushr        v24.2d, v22.2d, 8
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    calc_qpelb  v28, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb  v29, v17, v18, v19, v20, v21, v22, v23, v24
+    zip1        v16.8h, v28.8h, v29.8h
+    zip2        v17.8h, v28.8h, v29.8h
+    st1         { v16.8h }, [x0], 16
+    st1         { v17.4h }, [x0], x14
+    subs        x3, x3, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h16_8_neon, export=1
+    load_qpel_filterb x4, x5
+    sub         x1, x1, 3
+    sub         x2, x2, 16
+    mov         x14, 128
+1:  ld2         { v16.8b, v17.8b }, [x1], 16
+    ld1         { v27.8b }, [x1], x2
+    ushr        v18.2d, v16.2d, 8
+    ushr        v19.2d, v17.2d, 8
+    mov         v18.b[7], v27.b[0]
+    mov         v19.b[7], v27.b[1]
+    ushr        v20.2d, v18.2d, 8
+    ushr        v21.2d, v19.2d, 8
+    mov         v20.b[7], v27.b[2]
+    mov         v21.b[7], v27.b[3]
+    ushr        v22.2d, v20.2d, 8
+    ushr        v23.2d, v21.2d, 8
+    mov         v22.b[7], v27.b[4]
+    mov         v23.b[7], v27.b[5]
+    ushr        v24.2d, v22.2d, 8
+    mov         v24.b[7], v27.b[6]
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    calc_qpelb  v28, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb  v29, v17, v18, v19, v20, v21, v22, v23, v24
+    st2         { v28.8h, v29.8h }, [x0], x14
+    subs        x3, x3, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h24_8_neon, export=1
+    load_qpel_filterb x4, x5
+    sub         x1, x1, 3
+    sub         x2, x2, 24
+    mov         x14, 128
+1:  ld3         { v16.8b, v17.8b, v18.8b }, [x1], 24
+    ld1         { v27.8b }, [x1], x2
+    ushr        v19.2d, v16.2d, 8
+    ushr        v20.2d, v17.2d, 8
+    ushr        v21.2d, v18.2d, 8
+    mov         v19.b[7], v27.b[0]
+    mov         v20.b[7], v27.b[1]
+    mov         v21.b[7], v27.b[2]
+    ushr        v22.2d, v19.2d, 8
+    ushr        v23.2d, v20.2d, 8
+    ushr        v24.2d, v21.2d, 8
+    mov         v22.b[7], v27.b[3]
+    mov         v23.b[7], v27.b[4]
+    mov         v24.b[7], v27.b[5]
+    ushr        v25.2d, v22.2d, 8
+    mov         v25.b[7], v27.b[6]
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    movi        v30.8h, 0
+    calc_qpelb  v28, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb  v29, v17, v18, v19, v20, v21, v22, v23, v24
+    calc_qpelb  v30, v18, v19, v20, v21, v22, v23, v24, v25
+    st3         { v28.8h, v29.8h, v30.8h }, [x0], x14
+    subs        x3, x3, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h32_8_neon, export=1
+    load_qpel_filterb x4, x5
+    sub         x1, x1, 3
+    sub         x2, x2, 32
+    mov         x14, 128
+1:  ld4         { v16.8b, v17.8b, v18.8b, v19.8b }, [x1], 32
+    ld1         { v27.8b }, [x1], x2
+    ushr        v20.2d, v16.2d, 8
+    ushr        v21.2d, v17.2d, 8
+    ushr        v22.2d, v18.2d, 8
+    ushr        v23.2d, v19.2d, 8
+    mov         v20.b[7], v27.b[0]
+    mov         v21.b[7], v27.b[1]
+    mov         v22.b[7], v27.b[2]
+    mov         v23.b[7], v27.b[3]
+    ushr        v24.2d, v20.2d, 8
+    ushr        v25.2d, v21.2d, 8
+    ushr        v26.2d, v22.2d, 8
+    mov         v24.b[7], v27.b[4]
+    mov         v25.b[7], v27.b[5]
+    mov         v26.b[7], v27.b[6]
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    movi        v30.8h, 0
+    movi        v31.8h, 0
+    calc_qpelb  v28, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb  v29, v17, v18, v19, v20, v21, v22, v23, v24
+    calc_qpelb  v30, v18, v19, v20, v21, v22, v23, v24, v25
+    calc_qpelb  v31, v19, v20, v21, v22, v23, v24, v25, v26
+    st4         { v28.8h, v29.8h, v30.8h, v31.8h }, [x0], x14
+    subs        x3, x3, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h48_8_neon, export=1
+    load_qpel_filterb x4, x5
+    sub         x1, x1, 3
+    sub         x2, x2, 48
+    mov         x7, 24
+    mov         x14, 80
+1:  ld3         { v16.16b, v17.16b, v18.16b }, [x1], x7
+    ld1         { v26.8b }, [x1], x7
+    ld1         { v27.8b }, [x1], x2
+    ushr        v19.2d, v16.2d, 8
+    ushr        v20.2d, v17.2d, 8
+    ushr        v21.2d, v18.2d, 8
+    mov         v19.b[7], v26.b[0]
+    mov         v19.b[15], v27.b[0]
+    mov         v20.b[7], v26.b[1]
+    mov         v20.b[15], v27.b[1]
+    mov         v21.b[7], v26.b[2]
+    mov         v21.b[15], v27.b[2]
+    ushr        v22.2d, v19.2d, 8
+    ushr        v23.2d, v20.2d, 8
+    ushr        v24.2d, v21.2d, 8
+    mov         v22.b[7], v26.b[3]
+    mov         v22.b[15], v27.b[3]
+    mov         v23.b[7], v26.b[4]
+    mov         v23.b[15], v27.b[4]
+    mov         v24.b[7], v26.b[5]
+    mov         v24.b[15], v27.b[5]
+    ushr        v25.2d, v22.2d, 8
+    mov         v25.b[7], v26.b[6]
+    mov         v25.b[15], v27.b[6]
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    movi        v30.8h, 0
+    calc_qpelb  v28, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb  v29, v17, v18, v19, v20, v21, v22, v23, v24
+    calc_qpelb  v30, v18, v19, v20, v21, v22, v23, v24, v25
+    st3         { v28.8h, v29.8h, v30.8h }, [x0], 48
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    movi        v30.8h, 0
+    calc_qpelb2 v28, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb2 v29, v17, v18, v19, v20, v21, v22, v23, v24
+    calc_qpelb2 v30, v18, v19, v20, v21, v22, v23, v24, v25
+    st3         { v28.8h, v29.8h, v30.8h }, [x0], x14
+    subs        x3, x3, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h64_8_neon, export=1
+    load_qpel_filterb x4, x5
+    sub         x1, x1, 3
+    sub         x2, x2, 64
+    mov         x7, 32
+1:  ld4         { v16.16b, v17.16b, v18.16b, v19.16b }, [x1], x7
+    ld1         { v27.8b }, [x1], x7
+    ld1         { v28.8b }, [x1], x2
+    ushr        v20.2d, v16.2d, 8
+    ushr        v21.2d, v17.2d, 8
+    ushr        v22.2d, v18.2d, 8
+    ushr        v23.2d, v19.2d, 8
+    mov         v20.b[7], v27.b[0]
+    mov         v21.b[7], v27.b[1]
+    mov         v22.b[7], v27.b[2]
+    mov         v23.b[7], v27.b[3]
+    mov         v20.b[15], v28.b[0]
+    mov         v21.b[15], v28.b[1]
+    mov         v22.b[15], v28.b[2]
+    mov         v23.b[15], v28.b[3]
+    ushr        v24.2d, v20.2d, 8
+    ushr        v25.2d, v21.2d, 8
+    ushr        v26.2d, v22.2d, 8
+    mov         v24.b[7], v27.b[4]
+    mov         v25.b[7], v27.b[5]
+    mov         v26.b[7], v27.b[6]
+    mov         v24.b[15], v28.b[4]
+    mov         v25.b[15], v28.b[5]
+    mov         v26.b[15], v28.b[6]
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    movi        v30.8h, 0
+    movi        v31.8h, 0
+    calc_qpelb  v28, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb  v29, v17, v18, v19, v20, v21, v22, v23, v24
+    calc_qpelb  v30, v18, v19, v20, v21, v22, v23, v24, v25
+    calc_qpelb  v31, v19, v20, v21, v22, v23, v24, v25, v26
+    st4         { v28.8h, v29.8h, v30.8h, v31.8h }, [x0], 64
+    movi        v28.8h, 0
+    movi        v29.8h, 0
+    movi        v30.8h, 0
+    movi        v31.8h, 0
+    calc_qpelb2 v28, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb2 v29, v17, v18, v19, v20, v21, v22, v23, v24
+    calc_qpelb2 v30, v18, v19, v20, v21, v22, v23, v24, v25
+    calc_qpelb2 v31, v19, v20, v21, v22, v23, v24, v25, v26
+    st4         { v28.8h, v29.8h, v30.8h, v31.8h }, [x0], 64
+    subs        x3, x3, 1
+    b.ne        1b
+    ret
+endfunc
+
+
+
+
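+// put_hevc_qpel_v*: vertical 8-tap filter from 8-bit source rows to the
+// 16-bit intermediate buffer. Seven rows are preloaded and the loop body
+// is unrolled eight times, so each iteration only loads one new row and
+// rotates the register roles instead of moving data between registers.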
+function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+    load_qpel_filterb x5, x4
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    mov         x9, 128
+    ld1         { v16.s }[0], [x1], x2
+    ld1         { v17.s }[0], [x1], x2
+    ld1         { v18.s }[0], [x1], x2
+    ld1         { v19.s }[0], [x1], x2
+    ld1         { v20.s }[0], [x1], x2
+    ld1         { v21.s }[0], [x1], x2
+    ld1         { v22.s }[0], [x1], x2
+1:  ld1         { v23.s }[0], [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    st1         { v24.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.s }[0], [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    st1         { v24.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.s }[0], [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    st1         { v24.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.s }[0], [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    st1         { v24.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v19.s }[0], [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    st1         { v24.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.s }[0], [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    st1         { v24.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v21.s }[0], [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    st1         { v24.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.s }[0], [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    st1         { v24.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
+    load_qpel_filterb x5, x4
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    mov         x9, 120
+    ld1         { v16.8b }, [x1], x2
+    ld1         { v17.8b }, [x1], x2
+    ld1         { v18.8b }, [x1], x2
+    ld1         { v19.8b }, [x1], x2
+    ld1         { v20.8b }, [x1], x2
+    ld1         { v21.8b }, [x1], x2
+    ld1         { v22.8b }, [x1], x2
+1:  ld1         { v23.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    st1         { v24.4h }, [x0], 8
+    st1         { v24.s }[2], [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    st1         { v24.4h }, [x0], 8
+    st1         { v24.s }[2], [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    st1         { v24.4h }, [x0], 8
+    st1         { v24.s }[2], [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    st1         { v24.4h }, [x0], 8
+    st1         { v24.s }[2], [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v19.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    st1         { v24.4h }, [x0], 8
+    st1         { v24.s }[2], [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    st1         { v24.4h }, [x0], 8
+    st1         { v24.s }[2], [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v21.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    st1         { v24.4h }, [x0], 8
+    st1         { v24.s }[2], [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    st1         { v24.4h }, [x0], 8
+    st1         { v24.s }[2], [x0], x9
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
+    load_qpel_filterb x5, x4
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    mov         x9, 128
+    ld1         { v16.8b }, [x1], x2
+    ld1         { v17.8b }, [x1], x2
+    ld1         { v18.8b }, [x1], x2
+    ld1         { v19.8b }, [x1], x2
+    ld1         { v20.8b }, [x1], x2
+    ld1         { v21.8b }, [x1], x2
+    ld1         { v22.8b }, [x1], x2
+1:  ld1         { v23.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    st1         { v24.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    st1         { v24.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    st1         { v24.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    st1         { v24.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v19.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    st1         { v24.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    st1         { v24.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v21.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    st1         { v24.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.8b }, [x1], x2
+    movi        v24.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    st1         { v24.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
+    load_qpel_filterb x5, x4
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    mov         x9, 112
+    ld1         { v16.16b }, [x1], x2
+    ld1         { v17.16b }, [x1], x2
+    ld1         { v18.16b }, [x1], x2
+    ld1         { v19.16b }, [x1], x2
+    ld1         { v20.16b }, [x1], x2
+    ld1         { v21.16b }, [x1], x2
+    ld1         { v22.16b }, [x1], x2
+1:  ld1         { v23.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb2 v25, v16, v17, v18, v19, v20, v21, v22, v23
+    st1         { v24.8h }, [x0], 16
+    st1         { v25.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    calc_qpelb2 v25, v17, v18, v19, v20, v21, v22, v23, v16
+    st1         { v24.8h }, [x0], 16
+    st1         { v25.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    calc_qpelb2 v25, v18, v19, v20, v21, v22, v23, v16, v17
+    st1         { v24.8h }, [x0], 16
+    st1         { v25.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    calc_qpelb2 v25, v19, v20, v21, v22, v23, v16, v17, v18
+    st1         { v24.8h }, [x0], 16
+    st1         { v25.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v19.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    calc_qpelb2 v25, v20, v21, v22, v23, v16, v17, v18, v19
+    st1         { v24.8h }, [x0], 16
+    st1         { v25.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    calc_qpelb2 v25, v21, v22, v23, v16, v17, v18, v19, v20
+    st1         { v24.8h }, [x0], 16
+    st1         { v25.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v21.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    calc_qpelb2 v25, v22, v23, v16, v17, v18, v19, v20, v21
+    st1         { v24.8h }, [x0], 16
+    st1         { v25.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    calc_qpelb2 v25, v23, v16, v17, v18, v19, v20, v21, v22
+    st1         { v24.8h }, [x0], 16
+    st1         { v25.4h }, [x0], x9
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
+    load_qpel_filterb x5, x4
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    mov         x9, 128
+    ld1         { v16.16b }, [x1], x2
+    ld1         { v17.16b }, [x1], x2
+    ld1         { v18.16b }, [x1], x2
+    ld1         { v19.16b }, [x1], x2
+    ld1         { v20.16b }, [x1], x2
+    ld1         { v21.16b }, [x1], x2
+    ld1         { v22.16b }, [x1], x2
+1:  ld1         { v23.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb2 v25, v16, v17, v18, v19, v20, v21, v22, v23
+    st1         { v24.8h, v25.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    calc_qpelb2 v25, v17, v18, v19, v20, v21, v22, v23, v16
+    st1         { v24.8h, v25.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    calc_qpelb2 v25, v18, v19, v20, v21, v22, v23, v16, v17
+    st1         { v24.8h, v25.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    calc_qpelb2 v25, v19, v20, v21, v22, v23, v16, v17, v18
+    st1         { v24.8h, v25.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v19.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    calc_qpelb2 v25, v20, v21, v22, v23, v16, v17, v18, v19
+    st1         { v24.8h, v25.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    calc_qpelb2 v25, v21, v22, v23, v16, v17, v18, v19, v20
+    st1         { v24.8h, v25.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v21.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    calc_qpelb2 v25, v22, v23, v16, v17, v18, v19, v20, v21
+    st1         { v24.8h, v25.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.16b }, [x1], x2
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    calc_qpelb2 v25, v23, v16, v17, v18, v19, v20, v21, v22
+    st1         { v24.8h, v25.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+// TODO: loads 32 bytes per source row although only 24 are used,
+// i.e. it may read up to 8 bytes beyond the block width
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
+    sub         sp, sp, 48
+    st1         { v8.16b, v9.16b, v10.16b }, [sp]
+    load_qpel_filterb x5, x4
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    mov         x9, 128
+    ld1         { v16.16b, v17.16b }, [x1], x2
+    ld1         { v18.16b, v19.16b }, [x1], x2
+    ld1         { v20.16b, v21.16b }, [x1], x2
+    ld1         { v22.16b, v23.16b }, [x1], x2
+    ld1         { v24.16b, v25.16b }, [x1], x2
+    ld1         { v26.16b, v27.16b }, [x1], x2
+    ld1         { v28.16b, v29.16b }, [x1], x2
+1:  ld1         { v30.16b, v31.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    calc_qpelb  v8,  v16, v18, v20, v22, v24, v26, v28, v30
+    calc_qpelb2 v9,  v16, v18, v20, v22, v24, v26, v28, v30
+    calc_qpelb  v10, v17, v19, v21, v23, v25, v27, v29, v31
+    st1         { v8.8h, v9.8h, v10.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.16b, v17.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    calc_qpelb  v8,  v18, v20, v22, v24, v26, v28, v30, v16
+    calc_qpelb2 v9,  v18, v20, v22, v24, v26, v28, v30, v16
+    calc_qpelb  v10, v19, v21, v23, v25, v27, v29, v31, v17
+    st1         { v8.8h, v9.8h, v10.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.16b, v19.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    calc_qpelb  v8,  v20, v22, v24, v26, v28, v30, v16, v18
+    calc_qpelb2 v9,  v20, v22, v24, v26, v28, v30, v16, v18
+    calc_qpelb  v10, v21, v23, v25, v27, v29, v31, v17, v19
+    st1         { v8.8h, v9.8h, v10.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.16b, v21.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    calc_qpelb  v8,  v22, v24, v26, v28, v30, v16, v18, v20
+    calc_qpelb2 v9,  v22, v24, v26, v28, v30, v16, v18, v20
+    calc_qpelb  v10, v23, v25, v27, v29, v31, v17, v19, v21
+    st1         { v8.8h, v9.8h, v10.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.16b, v23.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    calc_qpelb  v8,  v24, v26, v28, v30, v16, v18, v20, v22
+    calc_qpelb2 v9,  v24, v26, v28, v30, v16, v18, v20, v22
+    calc_qpelb  v10, v25, v27, v29, v31, v17, v19, v21, v23
+    st1         { v8.8h, v9.8h, v10.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v24.16b, v25.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    calc_qpelb  v8,  v26, v28, v30, v16, v18, v20, v22, v24
+    calc_qpelb2 v9,  v26, v28, v30, v16, v18, v20, v22, v24
+    calc_qpelb  v10, v27, v29, v31, v17, v19, v21, v23, v25
+    st1         { v8.8h, v9.8h, v10.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v26.16b, v27.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    calc_qpelb  v8,  v28, v30, v16, v18, v20, v22, v24, v26
+    calc_qpelb2 v9,  v28, v30, v16, v18, v20, v22, v24, v26
+    calc_qpelb  v10, v29, v31, v17, v19, v21, v23, v25, v27
+    st1         { v8.8h, v9.8h, v10.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v28.16b, v29.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    calc_qpelb  v8,  v30, v16, v18, v20, v22, v24, v26, v28
+    calc_qpelb2 v9,  v30, v16, v18, v20, v22, v24, v26, v28
+    calc_qpelb  v10, v31, v17, v19, v21, v23, v25, v27, v29
+    st1         { v8.8h, v9.8h, v10.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ld1         { v8.16b, v9.16b, v10.16b }, [sp], 48
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
+    sub         sp, sp, 64
+    st1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp]
+    load_qpel_filterb x5, x4
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    mov         x9, 128
+    ld1         { v16.16b, v17.16b }, [x1], x2
+    ld1         { v18.16b, v19.16b }, [x1], x2
+    ld1         { v20.16b, v21.16b }, [x1], x2
+    ld1         { v22.16b, v23.16b }, [x1], x2
+    ld1         { v24.16b, v25.16b }, [x1], x2
+    ld1         { v26.16b, v27.16b }, [x1], x2
+    ld1         { v28.16b, v29.16b }, [x1], x2
+1:  ld1         { v30.16b, v31.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v16, v18, v20, v22, v24, v26, v28, v30
+    calc_qpelb2 v9,  v16, v18, v20, v22, v24, v26, v28, v30
+    calc_qpelb  v10, v17, v19, v21, v23, v25, v27, v29, v31
+    calc_qpelb2 v11, v17, v19, v21, v23, v25, v27, v29, v31
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.16b, v17.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v18, v20, v22, v24, v26, v28, v30, v16
+    calc_qpelb2 v9,  v18, v20, v22, v24, v26, v28, v30, v16
+    calc_qpelb  v10, v19, v21, v23, v25, v27, v29, v31, v17
+    calc_qpelb2 v11, v19, v21, v23, v25, v27, v29, v31, v17
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.16b, v19.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v20, v22, v24, v26, v28, v30, v16, v18
+    calc_qpelb2 v9,  v20, v22, v24, v26, v28, v30, v16, v18
+    calc_qpelb  v10, v21, v23, v25, v27, v29, v31, v17, v19
+    calc_qpelb2 v11, v21, v23, v25, v27, v29, v31, v17, v19
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.16b, v21.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v22, v24, v26, v28, v30, v16, v18, v20
+    calc_qpelb2 v9,  v22, v24, v26, v28, v30, v16, v18, v20
+    calc_qpelb  v10, v23, v25, v27, v29, v31, v17, v19, v21
+    calc_qpelb2 v11, v23, v25, v27, v29, v31, v17, v19, v21
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.16b, v23.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v24, v26, v28, v30, v16, v18, v20, v22
+    calc_qpelb2 v9,  v24, v26, v28, v30, v16, v18, v20, v22
+    calc_qpelb  v10, v25, v27, v29, v31, v17, v19, v21, v23
+    calc_qpelb2 v11, v25, v27, v29, v31, v17, v19, v21, v23
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v24.16b, v25.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v26, v28, v30, v16, v18, v20, v22, v24
+    calc_qpelb2 v9,  v26, v28, v30, v16, v18, v20, v22, v24
+    calc_qpelb  v10, v27, v29, v31, v17, v19, v21, v23, v25
+    calc_qpelb2 v11, v27, v29, v31, v17, v19, v21, v23, v25
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v26.16b, v27.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v28, v30, v16, v18, v20, v22, v24, v26
+    calc_qpelb2 v9,  v28, v30, v16, v18, v20, v22, v24, v26
+    calc_qpelb  v10, v29, v31, v17, v19, v21, v23, v25, v27
+    calc_qpelb2 v11, v29, v31, v17, v19, v21, v23, v25, v27
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v28.16b, v29.16b }, [x1], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v30, v16, v18, v20, v22, v24, v26, v28
+    calc_qpelb2 v9,  v30, v16, v18, v20, v22, v24, v26, v28
+    calc_qpelb  v10, v31, v17, v19, v21, v23, v25, v27, v29
+    calc_qpelb2 v11, v31, v17, v19, v21, v23, v25, v27, v29
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x0], x9
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ld1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp], 64
+    ret
+endfunc
+
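+// 48-wide case: handled as two 24-wide halves by calling the v24 function twice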
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
+    stp         x5, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    bl          ff_hevc_put_hevc_qpel_v24_8_neon
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    ldr         x5, [sp]
+    add         x0, x0, 48
+    add         x1, x1, 24
+    bl          ff_hevc_put_hevc_qpel_v24_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
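+// 64-wide case: processed in 32-wide column strips; the inner loop (2:)
+// filters one strip vertically as in the v32 case, the outer loop (1:)
+// advances dst/src and decrements the remaining width in x6 by 32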
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
+    sub         sp, sp, 64
+    st1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp]
+    load_qpel_filterb x5, x4
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    mov         x9, 128
+1:  mov         x11, x3     // height
+    mov         x10, x0     // dst
+    mov         x8, x1      // src
+
+    ld1         { v16.16b, v17.16b }, [x8], x2
+    ld1         { v18.16b, v19.16b }, [x8], x2
+    ld1         { v20.16b, v21.16b }, [x8], x2
+    ld1         { v22.16b, v23.16b }, [x8], x2
+    ld1         { v24.16b, v25.16b }, [x8], x2
+    ld1         { v26.16b, v27.16b }, [x8], x2
+    ld1         { v28.16b, v29.16b }, [x8], x2
+2:  ld1         { v30.16b, v31.16b }, [x8], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v16, v18, v20, v22, v24, v26, v28, v30
+    calc_qpelb2 v9,  v16, v18, v20, v22, v24, v26, v28, v30
+    calc_qpelb  v10, v17, v19, v21, v23, v25, v27, v29, v31
+    calc_qpelb2 v11, v17, v19, v21, v23, v25, v27, v29, v31
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x10], x9
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v16.16b, v17.16b }, [x8], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v18, v20, v22, v24, v26, v28, v30, v16
+    calc_qpelb2 v9,  v18, v20, v22, v24, v26, v28, v30, v16
+    calc_qpelb  v10, v19, v21, v23, v25, v27, v29, v31, v17
+    calc_qpelb2 v11, v19, v21, v23, v25, v27, v29, v31, v17
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x10], x9
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v18.16b, v19.16b }, [x8], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v20, v22, v24, v26, v28, v30, v16, v18
+    calc_qpelb2 v9,  v20, v22, v24, v26, v28, v30, v16, v18
+    calc_qpelb  v10, v21, v23, v25, v27, v29, v31, v17, v19
+    calc_qpelb2 v11, v21, v23, v25, v27, v29, v31, v17, v19
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x10], x9
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v20.16b, v21.16b }, [x8], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v22, v24, v26, v28, v30, v16, v18, v20
+    calc_qpelb2 v9,  v22, v24, v26, v28, v30, v16, v18, v20
+    calc_qpelb  v10, v23, v25, v27, v29, v31, v17, v19, v21
+    calc_qpelb2 v11, v23, v25, v27, v29, v31, v17, v19, v21
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x10], x9
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v22.16b, v23.16b }, [x8], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v24, v26, v28, v30, v16, v18, v20, v22
+    calc_qpelb2 v9,  v24, v26, v28, v30, v16, v18, v20, v22
+    calc_qpelb  v10, v25, v27, v29, v31, v17, v19, v21, v23
+    calc_qpelb2 v11, v25, v27, v29, v31, v17, v19, v21, v23
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x10], x9
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v24.16b, v25.16b }, [x8], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v26, v28, v30, v16, v18, v20, v22, v24
+    calc_qpelb2 v9,  v26, v28, v30, v16, v18, v20, v22, v24
+    calc_qpelb  v10, v27, v29, v31, v17, v19, v21, v23, v25
+    calc_qpelb2 v11, v27, v29, v31, v17, v19, v21, v23, v25
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x10], x9
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v26.16b, v27.16b }, [x8], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v28, v30, v16, v18, v20, v22, v24, v26
+    calc_qpelb2 v9,  v28, v30, v16, v18, v20, v22, v24, v26
+    calc_qpelb  v10, v29, v31, v17, v19, v21, v23, v25, v27
+    calc_qpelb2 v11, v29, v31, v17, v19, v21, v23, v25, v27
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x10], x9
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v28.16b, v29.16b }, [x8], x2
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v30, v16, v18, v20, v22, v24, v26, v28
+    calc_qpelb2 v9,  v30, v16, v18, v20, v22, v24, v26, v28
+    calc_qpelb  v10, v31, v17, v19, v21, v23, v25, v27, v29
+    calc_qpelb2 v11, v31, v17, v19, v21, v23, v25, v27, v29
+    st1         { v8.8h, v9.8h, v10.8h, v11.8h }, [x10], x9
+    subs        x11, x11, 1
+    b.hi        2b
+
+3:  add         x0, x0, 64
+    add         x1, x1, 32
+    subs        x6, x6, 32
+    b.hi        1b
+    ld1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp], 64
+    ret
+endfunc
+
+
+
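+// put_hevc_qpel_hv*: two-pass filter. The horizontal pass is run first,
+// over height + 7 rows, into a temporary buffer on the stack with a
+// 128-byte stride; the vertical 8-tap pass (calc_qpelh) then filters
+// those 16-bit intermediates into dst.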
+function ff_hevc_put_hevc_qpel_hv4_8_neon, export=1
+    add         x10, x3, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    add         x3, x3, 7
+    bl          ff_hevc_put_hevc_qpel_h4_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_qpel_filterh x5, x4
+    mov         x7, 128
+    ld1         { v16.4h }, [sp], x7
+    ld1         { v17.4h }, [sp], x7
+    ld1         { v18.4h }, [sp], x7
+    ld1         { v19.4h }, [sp], x7
+    ld1         { v20.4h }, [sp], x7
+    ld1         { v21.4h }, [sp], x7
+    ld1         { v22.4h }, [sp], x7
+1:  ld1         { v23.4h }, [sp], x7
+    calc_qpelh  v1, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn
+    st1         { v1.4h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.4h }, [sp], x7
+    calc_qpelh  v1, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn
+    st1         { v1.4h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.4h }, [sp], x7
+    calc_qpelh  v1, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn
+    st1         { v1.4h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.4h }, [sp], x7
+    calc_qpelh  v1, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn
+    st1         { v1.4h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v19.4h }, [sp], x7
+    calc_qpelh  v1, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn
+    st1         { v1.4h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.4h }, [sp], x7
+    calc_qpelh  v1, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn
+    st1         { v1.4h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v21.4h }, [sp], x7
+    calc_qpelh  v1, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn
+    st1         { v1.4h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.4h }, [sp], x7
+    calc_qpelh  v1, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn
+    st1         { v1.4h }, [x0], x7
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon, export=1
+    add         x10, x3, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    add         x3, x3, 7
+    bl          ff_hevc_put_hevc_qpel_h6_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_qpel_filterh x5, x4
+    mov         x7, 128
+    mov         x8, 120
+    ld1         { v16.8h }, [sp], x7
+    ld1         { v17.8h }, [sp], x7
+    ld1         { v18.8h }, [sp], x7
+    ld1         { v19.8h }, [sp], x7
+    ld1         { v20.8h }, [sp], x7
+    ld1         { v21.8h }, [sp], x7
+    ld1         { v22.8h }, [sp], x7
+1:  ld1         { v23.8h }, [sp], x7
+    calc_qpelh  v1, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn
+    calc_qpelh2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn2
+    st1         { v1.4h }, [x0], 8
+    st1         { v1.s }[2], [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x7
+    calc_qpelh  v1, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn
+    calc_qpelh2 v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn2
+    st1         { v1.4h }, [x0], 8
+    st1         { v1.s }[2], [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x7
+    calc_qpelh  v1, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn
+    calc_qpelh2 v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn2
+    st1         { v1.4h }, [x0], 8
+    st1         { v1.s }[2], [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x7
+    calc_qpelh  v1, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn
+    calc_qpelh2 v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn2
+    st1         { v1.4h }, [x0], 8
+    st1         { v1.s }[2], [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v19.8h }, [sp], x7
+    calc_qpelh  v1, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn
+    calc_qpelh2 v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn2
+    st1         { v1.4h }, [x0], 8
+    st1         { v1.s }[2], [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.8h }, [sp], x7
+    calc_qpelh  v1, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn
+    calc_qpelh2 v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn2
+    st1         { v1.4h }, [x0], 8
+    st1         { v1.s }[2], [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v21.8h }, [sp], x7
+    calc_qpelh  v1, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn
+    calc_qpelh2 v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn2
+    st1         { v1.4h }, [x0], 8
+    st1         { v1.s }[2], [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.8h }, [sp], x7
+    calc_qpelh  v1, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn
+    calc_qpelh2 v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn2
+    st1         { v1.4h }, [x0], 8
+    st1         { v1.s }[2], [x0], x8
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv8_8_neon, export=1
+    add         x10, x3, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    add         x3, x3, 7
+    bl          ff_hevc_put_hevc_qpel_h8_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_qpel_filterh x5, x4
+    mov         x7, 128
+    ld1         { v16.8h }, [sp], x7
+    ld1         { v17.8h }, [sp], x7
+    ld1         { v18.8h }, [sp], x7
+    ld1         { v19.8h }, [sp], x7
+    ld1         { v20.8h }, [sp], x7
+    ld1         { v21.8h }, [sp], x7
+    ld1         { v22.8h }, [sp], x7
+1:  ld1         { v23.8h }, [sp], x7
+    calc_qpelh  v1, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn
+    calc_qpelh2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn2
+    st1         { v1.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x7
+    calc_qpelh  v1, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn
+    calc_qpelh2 v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn2
+    st1         { v1.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x7
+    calc_qpelh  v1, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn
+    calc_qpelh2 v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn2
+    st1         { v1.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x7
+    calc_qpelh  v1, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn
+    calc_qpelh2 v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn2
+    st1         { v1.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v19.8h }, [sp], x7
+    calc_qpelh  v1, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn
+    calc_qpelh2 v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn2
+    st1         { v1.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.8h }, [sp], x7
+    calc_qpelh  v1, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn
+    calc_qpelh2 v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn2
+    st1         { v1.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v21.8h }, [sp], x7
+    calc_qpelh  v1, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn
+    calc_qpelh2 v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn2
+    st1         { v1.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.8h }, [sp], x7
+    calc_qpelh  v1, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn
+    calc_qpelh2 v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn2
+    st1         { v1.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv12_8_neon, export=1
+    add         x10, x3, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    add         x3, x3, 7
+    bl          ff_hevc_put_hevc_qpel_h12_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_qpel_filterh x5, x4
+    mov         x7, 128
+    mov         x8, 112
+    ld1         { v16.8h, v17.8h }, [sp], x7
+    ld1         { v18.8h, v19.8h }, [sp], x7
+    ld1         { v20.8h, v21.8h }, [sp], x7
+    ld1         { v22.8h, v23.8h }, [sp], x7
+    ld1         { v24.8h, v25.8h }, [sp], x7
+    ld1         { v26.8h, v27.8h }, [sp], x7
+    ld1         { v28.8h, v29.8h }, [sp], x7
+1:  ld1         { v30.8h, v31.8h }, [sp], x7
+    calc_qpelh  v1, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn
+    calc_qpelh2 v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn2
+    calc_qpelh  v2, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn
+    st1         { v1.8h }, [x0], 16
+    st1         { v2.4h }, [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h }, [sp], x7
+    calc_qpelh  v1, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn
+    calc_qpelh2 v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn2
+    calc_qpelh  v2, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn
+    st1         { v1.8h }, [x0], 16
+    st1         { v2.4h }, [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8h, v19.8h }, [sp], x7
+    calc_qpelh  v1, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn
+    calc_qpelh2 v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn2
+    calc_qpelh  v2, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn
+    st1         { v1.8h }, [x0], 16
+    st1         { v2.4h }, [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.8h, v21.8h }, [sp], x7
+    calc_qpelh  v1, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn
+    calc_qpelh2 v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn2
+    calc_qpelh  v2, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn
+    st1         { v1.8h }, [x0], 16
+    st1         { v2.4h }, [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.8h, v23.8h }, [sp], x7
+    calc_qpelh  v1, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn
+    calc_qpelh2 v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn2
+    calc_qpelh  v2, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn
+    st1         { v1.8h }, [x0], 16
+    st1         { v2.4h }, [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v24.8h, v25.8h }, [sp], x7
+    calc_qpelh  v1, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn
+    calc_qpelh2 v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn2
+    calc_qpelh  v2, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn
+    st1         { v1.8h }, [x0], 16
+    st1         { v2.4h }, [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v26.8h, v27.8h }, [sp], x7
+    calc_qpelh  v1, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn
+    calc_qpelh2 v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn2
+    calc_qpelh  v2, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn
+    st1         { v1.8h }, [x0], 16
+    st1         { v2.4h }, [x0], x8
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v28.8h, v29.8h }, [sp], x7
+    calc_qpelh  v1, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn
+    calc_qpelh2 v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn2
+    calc_qpelh  v2, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn
+    st1         { v1.8h }, [x0], 16
+    st1         { v2.4h }, [x0], x8
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv16_8_neon, export=1
+    add         x10, x3, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    add         x3, x3, 7
+    bl          ff_hevc_put_hevc_qpel_h16_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_qpel_filterh x5, x4
+    mov         x7, 128
+    ld1         { v16.8h, v17.8h }, [sp], x7
+    ld1         { v18.8h, v19.8h }, [sp], x7
+    ld1         { v20.8h, v21.8h }, [sp], x7
+    ld1         { v22.8h, v23.8h }, [sp], x7
+    ld1         { v24.8h, v25.8h }, [sp], x7
+    ld1         { v26.8h, v27.8h }, [sp], x7
+    ld1         { v28.8h, v29.8h }, [sp], x7
+1:  ld1         { v30.8h, v31.8h }, [sp], x7
+    calc_qpelh  v1, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn
+    calc_qpelh2 v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn2
+    calc_qpelh  v2, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn
+    calc_qpelh2 v2, v3, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn2
+    st1         { v1.8h, v2.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h }, [sp], x7
+    calc_qpelh  v1, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn
+    calc_qpelh2 v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn2
+    calc_qpelh  v2, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn
+    calc_qpelh2 v2, v3, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn2
+    st1         { v1.8h, v2.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v18.8h, v19.8h }, [sp], x7
+    calc_qpelh  v1, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn
+    calc_qpelh2 v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn2
+    calc_qpelh  v2, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn
+    calc_qpelh2 v2, v3, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn2
+    st1         { v1.8h, v2.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.8h, v21.8h }, [sp], x7
+    calc_qpelh  v1, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn
+    calc_qpelh2 v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn2
+    calc_qpelh  v2, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn
+    calc_qpelh2 v2, v3, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn2
+    st1         { v1.8h, v2.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v22.8h, v23.8h }, [sp], x7
+    calc_qpelh  v1, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn
+    calc_qpelh2 v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn2
+    calc_qpelh  v2, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn
+    calc_qpelh2 v2, v3, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn2
+    st1         { v1.8h, v2.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v24.8h, v25.8h }, [sp], x7
+    calc_qpelh  v1, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn
+    calc_qpelh2 v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn2
+    calc_qpelh  v2, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn
+    calc_qpelh2 v2, v3, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn2
+    st1         { v1.8h, v2.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v26.8h, v27.8h }, [sp], x7
+    calc_qpelh  v1, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn
+    calc_qpelh2 v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn2
+    calc_qpelh  v2, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn
+    calc_qpelh2 v2, v3, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn2
+    st1         { v1.8h, v2.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v28.8h, v29.8h }, [sp], x7
+    calc_qpelh  v1, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn
+    calc_qpelh2 v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn2
+    calc_qpelh  v2, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn
+    calc_qpelh2 v2, v3, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn2
+    st1         { v1.8h, v2.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv24_8_neon, export=1
+    sub         sp, sp, 64
+    st1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp]      // v8-v15 are callee-saved,
+    sub         sp, sp, 64
+    st1         { v12.16b, v13.16b, v14.16b, v15.16b }, [sp]    // so spill them first
+    add         x10, x3, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    add         x3, x3, 7
+    bl          ff_hevc_put_hevc_qpel_h24_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_qpel_filterh x5, x4
+    mov         x7, 128
+    ld1         { v8.8h, v9.8h, v10.8h }, [sp], x7
+    ld1         { v11.8h, v12.8h, v13.8h }, [sp], x7
+    ld1         { v14.8h, v15.8h, v16.8h }, [sp], x7
+    ld1         { v17.8h, v18.8h, v19.8h }, [sp], x7
+    ld1         { v20.8h, v21.8h, v22.8h }, [sp], x7
+    ld1         { v23.8h, v24.8h, v25.8h }, [sp], x7
+    ld1         { v26.8h, v27.8h, v28.8h }, [sp], x7
+1:  ld1         { v29.8h, v30.8h, v31.8h }, [sp], x7
+    calc_qpelh  v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
+    calc_qpelh2 v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn2
+    calc_qpelh  v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
+    calc_qpelh2 v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn2
+    calc_qpelh  v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
+    calc_qpelh2 v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn2
+    st1         { v1.8h, v2.8h, v3.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v8.8h, v9.8h, v10.8h }, [sp], x7
+    calc_qpelh  v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
+    calc_qpelh2 v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn2
+    calc_qpelh  v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
+    calc_qpelh2 v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn2
+    calc_qpelh  v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
+    calc_qpelh2 v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn2
+    st1         { v1.8h, v2.8h, v3.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v11.8h, v12.8h, v13.8h }, [sp], x7
+    calc_qpelh  v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
+    calc_qpelh2 v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn2
+    calc_qpelh  v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
+    calc_qpelh2 v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn2
+    calc_qpelh  v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
+    calc_qpelh2 v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn2
+    st1         { v1.8h, v2.8h, v3.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v14.8h, v15.8h, v16.8h }, [sp], x7
+    calc_qpelh  v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
+    calc_qpelh2 v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn2
+    calc_qpelh  v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
+    calc_qpelh2 v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn2
+    calc_qpelh  v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
+    calc_qpelh2 v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn2
+    st1         { v1.8h, v2.8h, v3.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v17.8h, v18.8h, v19.8h }, [sp], x7
+    calc_qpelh  v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
+    calc_qpelh2 v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn2
+    calc_qpelh  v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
+    calc_qpelh2 v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn2
+    calc_qpelh  v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
+    calc_qpelh2 v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn2
+    st1         { v1.8h, v2.8h, v3.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v20.8h, v21.8h, v22.8h }, [sp], x7
+    calc_qpelh  v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
+    calc_qpelh2 v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn2
+    calc_qpelh  v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
+    calc_qpelh2 v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn2
+    calc_qpelh  v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
+    calc_qpelh2 v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn2
+    st1         { v1.8h, v2.8h, v3.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v23.8h, v24.8h, v25.8h }, [sp], x7
+    calc_qpelh  v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
+    calc_qpelh2 v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn2
+    calc_qpelh  v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
+    calc_qpelh2 v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn2
+    calc_qpelh  v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
+    calc_qpelh2 v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn2
+    st1         { v1.8h, v2.8h, v3.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.eq        2f
+
+    ld1         { v26.8h, v27.8h, v28.8h }, [sp], x7
+    calc_qpelh  v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
+    calc_qpelh2 v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn2
+    calc_qpelh  v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
+    calc_qpelh2 v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn2
+    calc_qpelh  v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
+    calc_qpelh2 v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn2
+    st1         { v1.8h, v2.8h, v3.8h }, [x0], x7
+    subs        x3, x3, 1
+    b.hi        1b
+2:  ld1         { v12.16b, v13.16b, v14.16b, v15.16b }, [sp], 64
+    ld1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp], 64
+    ret
+endfunc
+
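+// hv32 works on the block in 16-column strips, with the remaining width kept
+// in x6; hv64 calls it twice with x6 = 32, and hv48 is assembled from two
+// hv24 calls offset by 24 pixels.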
+function ff_hevc_put_hevc_qpel_hv32_8_neon, export=1
+    add         x10, x3, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x3, [sp, -16]!
+    stp         x5, x30, [sp, -16]!
+    add         x0, sp, 32
+    sub         x1, x1, x2, lsl 1
+    sub         x1, x1, x2
+    add         x3, x3, 7
+    bl          ff_hevc_put_hevc_qpel_h32_8_neon
+    ldp         x5, x30, [sp], 16
+    ldp         x0, x3, [sp], 16
+    load_qpel_filterh x5, x4
+    mov         x7, 128
+1:  mov         x9, x3      // height
+    mov         x5, x0      // dst
+    mov         x8, sp      // src
+
+    ld1         { v16.8h, v17.8h }, [x8], x7
+    ld1         { v18.8h, v19.8h }, [x8], x7
+    ld1         { v20.8h, v21.8h }, [x8], x7
+    ld1         { v22.8h, v23.8h }, [x8], x7
+    ld1         { v24.8h, v25.8h }, [x8], x7
+    ld1         { v26.8h, v27.8h }, [x8], x7
+    ld1         { v28.8h, v29.8h }, [x8], x7
+2:  ld1         { v30.8h, v31.8h }, [x8], x7
+    calc_qpelh  v1, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn
+    calc_qpelh2 v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn2
+    calc_qpelh  v2, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn
+    calc_qpelh2 v2, v3, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn2
+    st1         { v1.8h, v2.8h }, [x5], x7
+    subs        x9, x9, 1
+    b.eq        3f
+
+    ld1         { v16.8h, v17.8h }, [x8], x7
+    calc_qpelh  v1, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn
+    calc_qpelh2 v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn2
+    calc_qpelh  v2, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn
+    calc_qpelh2 v2, v3, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn2
+    st1         { v1.8h, v2.8h }, [x5], x7
+    subs        x9, x9, 1
+    b.eq        3f
+
+    ld1         { v18.8h, v19.8h }, [x8], x7
+    calc_qpelh  v1, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn
+    calc_qpelh2 v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn2
+    calc_qpelh  v2, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn
+    calc_qpelh2 v2, v3, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn2
+    st1         { v1.8h, v2.8h }, [x5], x7
+    subs        x9, x9, 1
+    b.eq        3f
+
+    ld1         { v20.8h, v21.8h }, [x8], x7
+    calc_qpelh  v1, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn
+    calc_qpelh2 v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn2
+    calc_qpelh  v2, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn
+    calc_qpelh2 v2, v3, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn2
+    st1         { v1.8h, v2.8h }, [x5], x7
+    subs        x9, x9, 1
+    b.eq        3f
+
+    ld1         { v22.8h, v23.8h }, [x8], x7
+    calc_qpelh  v1, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn
+    calc_qpelh2 v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn2
+    calc_qpelh  v2, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn
+    calc_qpelh2 v2, v3, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn2
+    st1         { v1.8h, v2.8h }, [x5], x7
+    subs        x9, x9, 1
+    b.eq        3f
+
+    ld1         { v24.8h, v25.8h }, [x8], x7
+    calc_qpelh  v1, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn
+    calc_qpelh2 v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn2
+    calc_qpelh  v2, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn
+    calc_qpelh2 v2, v3, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn2
+    st1         { v1.8h, v2.8h }, [x5], x7
+    subs        x9, x9, 1
+    b.eq        3f
+
+    ld1         { v26.8h, v27.8h }, [x8], x7
+    calc_qpelh  v1, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn
+    calc_qpelh2 v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn2
+    calc_qpelh  v2, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn
+    calc_qpelh2 v2, v3, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn2
+    st1         { v1.8h, v2.8h }, [x5], x7
+    subs        x9, x9, 1
+    b.eq        3f
+
+    ld1         { v28.8h, v29.8h }, [x8], x7
+    calc_qpelh  v1, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn
+    calc_qpelh2 v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn2
+    calc_qpelh  v2, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn
+    calc_qpelh2 v2, v3, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn2
+    st1         { v1.8h, v2.8h }, [x5], x7
+    subs        x9, x9, 1
+    b.hi        2b
+
+3:  add         x0, x0, 32
+    add         sp, sp, 32
+    subs        x6, x6, 16
+    b.hi        1b
+
+    add         sp, sp, 64      // discard rest of first line
+    add         x10, x3, 6
+    lsl         x10, x10, 7
+    add         sp, sp, x10     // tmp_array without first line
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv48_8_neon, export=1
+    stp         xzr, x30, [sp, -16]!    // save lr (xzr pads to 16-byte alignment)
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    bl          ff_hevc_put_hevc_qpel_hv24_8_neon
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    add         x0, x0, 48
+    add         x1, x1, 24
+    bl          ff_hevc_put_hevc_qpel_hv24_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv64_8_neon, export=1
+    stp         xzr, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    mov         x6, 32
+    bl          ff_hevc_put_hevc_qpel_hv32_8_neon
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    add         x0, x0, 64
+    add         x1, x1, 32
+    mov         x6, 32
+    bl          ff_hevc_put_hevc_qpel_hv32_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
+
+
+
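+// put_hevc_qpel_uni_h*: horizontal-only filtering straight to 8-bit output.
+// The 4/6/8-wide variants slide an 8-byte window over the row by shifting the
+// vector right one byte and inserting the next source byte into the top lane;
+// the wider variants de-interleave the row with ld2/ld3/ld4 so several filter
+// phases are accumulated in parallel and re-interleaved on store.
+// As a rough scalar reference (assuming the usual 8-tap HEVC luma filter
+// f[0..7] selected by mx), each output row computes:
+//     for (x = 0; x < width; x++) {
+//         int sum = 0;
+//         for (k = 0; k < 8; k++)
+//             sum += src[x + k - 3] * f[k];
+//         dst[x] = av_clip_uint8((sum + 32) >> 6);
+//     }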
+function ff_hevc_put_hevc_qpel_uni_h4_8_neon, export=1
+    load_qpel_filterb x5, x6
+    sub         x2, x2, 3
+1:  ld1         { v16.8b, v17.8b }, [x2], x3
+    movi        v20.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b   // umlsl/umlal pattern follows the filter tap signs
+    ushr        v16.2d, v16.2d, 8       // slide the 8-byte window one pixel...
+    mov         v16.b[7], v17.b[0]      // ...and refill the top lane from v17
+    umlal       v20.8h, v16.8b, v1.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[1]
+    umlsl       v20.8h, v16.8b, v2.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[2]
+    umlal       v20.8h, v16.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[3]
+    umlal       v20.8h, v16.8b, v4.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[4]
+    umlsl       v20.8h, v16.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[5]
+    umlal       v20.8h, v16.8b, v6.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[6]
+    umlsl       v20.8h, v16.8b, v7.8b
+    sqrshrun    v20.8b, v20.8h, 6
+    st1         { v20.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h6_8_neon, export=1
+    load_qpel_filterb x5, x6
+    sub         x2, x2, 3
+    sub         x1, x1, 4
+1:  ld1         { v16.8b, v17.8b }, [x2], x3
+    movi        v20.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[0]
+    umlal       v20.8h, v16.8b, v1.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[1]
+    umlsl       v20.8h, v16.8b, v2.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[2]
+    umlal       v20.8h, v16.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[3]
+    umlal       v20.8h, v16.8b, v4.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[4]
+    umlsl       v20.8h, v16.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[5]
+    umlal       v20.8h, v16.8b, v6.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[6]
+    umlsl       v20.8h, v16.8b, v7.8b
+    sqrshrun    v20.8b, v20.8h, 6
+    st1         { v20.s }[0], [x0], 4
+    st1         { v20.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h8_8_neon, export=1
+    load_qpel_filterb x5, x6
+    sub         x2, x2, 3
+1:  ld1         { v16.8b, v17.8b }, [x2], x3
+    movi        v20.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[0]
+    umlal       v20.8h, v16.8b, v1.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[1]
+    umlsl       v20.8h, v16.8b, v2.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[2]
+    umlal       v20.8h, v16.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[3]
+    umlal       v20.8h, v16.8b, v4.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[4]
+    umlsl       v20.8h, v16.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[5]
+    umlal       v20.8h, v16.8b, v6.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[6]
+    umlsl       v20.8h, v16.8b, v7.8b
+    sqrshrun    v20.8b, v20.8h, 6
+    st1         { v20.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h12_8_neon, export=1
+    load_qpel_filterb x5, x6
+    sub         x2, x2, 3
+    sub         x1, x1, 8
+1:  ld2         { v16.8b, v17.8b }, [x2]
+    ldr         w12, [x2, 16]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v16.8b, v1.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v20.8h, v16.8b, v2.8b
+    umlal       v20.8h, v17.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    umlsl       v21.8h, v17.8b, v2.8b
+    umlal       v21.8h, v16.8b, v3.8b
+    ushr        v17.2d, v17.2d, 8
+    umlal       v20.8h, v16.8b, v4.8b
+    umlsl       v20.8h, v17.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    umlal       v21.8h, v17.8b, v4.8b
+    umlsl       v21.8h, v16.8b, v5.8b
+    ushr        v17.2d, v17.2d, 8
+    umlal       v20.8h, v16.8b, v6.8b
+    umlsl       v20.8h, v17.8b, v7.8b
+    ushr        v16.2d, v16.2d, 8
+    umlal       v21.8h, v17.8b, v6.8b
+    umlsl       v21.8h, v16.8b, v7.8b
+    zip1        v16.8h, v20.8h, v21.8h
+    zip2        v17.8h, v20.8h, v21.8h
+    sqrshrun    v20.8b, v16.8h, 6
+    sqrshrun2   v20.16b, v17.8h, 6
+    st1         { v20.8b }, [x0], 8
+    st1         { v20.s }[2], [x0], x1
+    add         x2, x2, x3
+    subs        x4, x4, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h16_8_neon, export=1
+    load_qpel_filterb x5, x6
+    sub         x2, x2, 3
+1:  ld2         { v16.8b, v17.8b }, [x2]
+    ldr         x12, [x2, 16]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v16.8b, v1.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v20.8h, v16.8b, v2.8b
+    umlal       v20.8h, v17.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v21.8h, v17.8b, v2.8b
+    umlal       v21.8h, v16.8b, v3.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v20.8h, v16.8b, v4.8b
+    umlsl       v20.8h, v17.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v21.8h, v17.8b, v4.8b
+    umlsl       v21.8h, v16.8b, v5.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v20.8h, v16.8b, v6.8b
+    umlsl       v20.8h, v17.8b, v7.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    umlal       v21.8h, v17.8b, v6.8b
+    umlsl       v21.8h, v16.8b, v7.8b
+    sqrshrun    v20.8b, v20.8h, 6
+    sqrshrun    v21.8b, v21.8h, 6
+    st2         { v20.8b, v21.8b }, [x0], x1
+    add         x2, x2, x3
+    subs        x4, x4, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h24_8_neon, export=1
+    load_qpel_filterb x5, x6
+    sub         x2, x2, 3
+1:  ld3         { v16.8b, v17.8b, v18.8b }, [x2]
+    ldr         x12, [x2, 24]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    umlsl       v20.8h, v18.8b, v2.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v18.8b, v1.8b
+    umlsl       v21.8h, v16.8b, v2.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v22.8h, v18.8b, v0.8b
+    umlal       v22.8h, v16.8b, v1.8b
+    umlsl       v22.8h, v17.8b, v2.8b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v20.8h, v16.8b, v3.8b
+    umlal       v20.8h, v17.8b, v4.8b
+    umlsl       v20.8h, v18.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v21.8h, v17.8b, v3.8b
+    umlal       v21.8h, v18.8b, v4.8b
+    umlsl       v21.8h, v16.8b, v5.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v22.8h, v18.8b, v3.8b
+    umlal       v22.8h, v16.8b, v4.8b
+    umlsl       v22.8h, v17.8b, v5.8b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v20.8h, v16.8b, v6.8b
+    umlsl       v20.8h, v17.8b, v7.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    umlal       v21.8h, v17.8b, v6.8b
+    umlsl       v21.8h, v18.8b, v7.8b
+    umlal       v22.8h, v18.8b, v6.8b
+    umlsl       v22.8h, v16.8b, v7.8b
+    sqrshrun    v20.8b, v20.8h, 6
+    sqrshrun    v21.8b, v21.8h, 6
+    sqrshrun    v22.8b, v22.8h, 6
+    st3         { v20.8b, v21.8b, v22.8b }, [x0], x1
+    add         x2, x2, x3
+    subs        x4, x4, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h32_8_neon, export=1
+    load_qpel_filterb x5, x6
+    sub         x2, x2, 3
+1:  ld4         { v16.8b, v17.8b, v18.8b, v19.8b }, [x2]
+    ldr         x12, [x2, 32]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    movi        v23.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    umlsl       v20.8h, v18.8b, v2.8b
+    umlal       v20.8h, v19.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v18.8b, v1.8b
+    umlsl       v21.8h, v19.8b, v2.8b
+    umlal       v21.8h, v16.8b, v3.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v22.8h, v18.8b, v0.8b
+    umlal       v22.8h, v19.8b, v1.8b
+    umlsl       v22.8h, v16.8b, v2.8b
+    umlal       v22.8h, v17.8b, v3.8b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v23.8h, v19.8b, v0.8b
+    umlal       v23.8h, v16.8b, v1.8b
+    umlsl       v23.8h, v17.8b, v2.8b
+    umlal       v23.8h, v18.8b, v3.8b
+    ushr        v19.2d, v19.2d, 8
+    mov         v19.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v20.8h, v16.8b, v4.8b
+    umlsl       v20.8h, v17.8b, v5.8b
+    umlal       v20.8h, v18.8b, v6.8b
+    umlsl       v20.8h, v19.8b, v7.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v21.8h, v17.8b, v4.8b
+    umlsl       v21.8h, v18.8b, v5.8b
+    umlal       v21.8h, v19.8b, v6.8b
+    umlsl       v21.8h, v16.8b, v7.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v22.8h, v18.8b, v4.8b
+    umlsl       v22.8h, v19.8b, v5.8b
+    umlal       v22.8h, v16.8b, v6.8b
+    umlsl       v22.8h, v17.8b, v7.8b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    umlal       v23.8h, v19.8b, v4.8b
+    umlsl       v23.8h, v16.8b, v5.8b
+    umlal       v23.8h, v17.8b, v6.8b
+    umlsl       v23.8h, v18.8b, v7.8b
+    sqrshrun    v20.8b, v20.8h, 6
+    sqrshrun    v21.8b, v21.8h, 6
+    sqrshrun    v22.8b, v22.8h, 6
+    sqrshrun    v23.8b, v23.8h, 6
+    st4         { v20.8b, v21.8b, v22.8b, v23.8b }, [x0], x1
+    add         x2, x2, x3
+    subs        x4, x4, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h48_8_neon, export=1
+    load_qpel_filterb x5, x6
+    sub         x2, x2, 3
+1:  ld3         { v16.16b, v17.16b, v18.16b }, [x2]
+    ldr         x12, [x2, 24]
+    ldr         x13, [x2, 48]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    movi        v23.8h, 0
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    umlsl       v20.8h, v18.8b, v2.8b
+    umlsl2      v23.8h, v16.16b, v0.16b
+    umlal2      v23.8h, v17.16b, v1.16b
+    umlsl2      v23.8h, v18.16b, v2.16b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    mov         v16.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v18.8b, v1.8b
+    umlsl       v21.8h, v16.8b, v2.8b
+    umlsl2      v24.8h, v17.16b, v0.16b
+    umlal2      v24.8h, v18.16b, v1.16b
+    umlsl2      v24.8h, v16.16b, v2.16b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    mov         v17.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlsl       v22.8h, v18.8b, v0.8b
+    umlal       v22.8h, v16.8b, v1.8b
+    umlsl       v22.8h, v17.8b, v2.8b
+    umlsl2      v25.8h, v18.16b, v0.16b
+    umlal2      v25.8h, v16.16b, v1.16b
+    umlsl2      v25.8h, v17.16b, v2.16b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    mov         v18.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlal       v20.8h, v16.8b, v3.8b
+    umlal       v20.8h, v17.8b, v4.8b
+    umlsl       v20.8h, v18.8b, v5.8b
+    umlal2      v23.8h, v16.16b, v3.16b
+    umlal2      v23.8h, v17.16b, v4.16b
+    umlsl2      v23.8h, v18.16b, v5.16b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    mov         v16.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlal       v21.8h, v17.8b, v3.8b
+    umlal       v21.8h, v18.8b, v4.8b
+    umlsl       v21.8h, v16.8b, v5.8b
+    umlal2      v24.8h, v17.16b, v3.16b
+    umlal2      v24.8h, v18.16b, v4.16b
+    umlsl2      v24.8h, v16.16b, v5.16b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    mov         v17.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlal       v22.8h, v18.8b, v3.8b
+    umlal       v22.8h, v16.8b, v4.8b
+    umlsl       v22.8h, v17.8b, v5.8b
+    umlal2      v25.8h, v18.16b, v3.16b
+    umlal2      v25.8h, v16.16b, v4.16b
+    umlsl2      v25.8h, v17.16b, v5.16b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    mov         v18.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlal       v20.8h, v16.8b, v6.8b
+    umlsl       v20.8h, v17.8b, v7.8b
+    umlal2      v23.8h, v16.16b, v6.16b
+    umlsl2      v23.8h, v17.16b, v7.16b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    mov         v16.b[15], w13
+    umlal       v21.8h, v17.8b, v6.8b
+    umlsl       v21.8h, v18.8b, v7.8b
+    umlal2      v24.8h, v17.16b, v6.16b
+    umlsl2      v24.8h, v18.16b, v7.16b
+    umlal       v22.8h, v18.8b, v6.8b
+    umlsl       v22.8h, v16.8b, v7.8b
+    umlal2      v25.8h, v18.16b, v6.16b
+    umlsl2      v25.8h, v16.16b, v7.16b
+    sqrshrun    v20.8b, v20.8h, 6
+    sqrshrun    v21.8b, v21.8h, 6
+    sqrshrun    v22.8b, v22.8h, 6
+    sqrshrun2   v20.16b, v23.8h, 6
+    sqrshrun2   v21.16b, v24.8h, 6
+    sqrshrun2   v22.16b, v25.8h, 6
+    st3         { v20.16b, v21.16b, v22.16b }, [x0], x1
+    add         x2, x2, x3
+    subs        x4, x4, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h64_8_neon, export=1
+    load_qpel_filterb x5, x6
+    sub         x2, x2, 3
+1:  ld4         { v16.16b, v17.16b, v18.16b, v19.16b }, [x2]
+    ldr         x12, [x2, 32]
+    ldr         x13, [x2, 64]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    movi        v23.8h, 0
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    movi        v26.8h, 0
+    movi        v27.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    umlsl       v20.8h, v18.8b, v2.8b
+    umlal       v20.8h, v19.8b, v3.8b
+    umlsl2      v24.8h, v16.16b, v0.16b
+    umlal2      v24.8h, v17.16b, v1.16b
+    umlsl2      v24.8h, v18.16b, v2.16b
+    umlal2      v24.8h, v19.16b, v3.16b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    mov         v16.b[15], w13
+    lsr         x13, x13, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v18.8b, v1.8b
+    umlsl       v21.8h, v19.8b, v2.8b
+    umlal       v21.8h, v16.8b, v3.8b
+    umlsl2      v25.8h, v17.16b, v0.16b
+    umlal2      v25.8h, v18.16b, v1.16b
+    umlsl2      v25.8h, v19.16b, v2.16b
+    umlal2      v25.8h, v16.16b, v3.16b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    mov         v17.b[15], w13
+    lsr         x13, x13, 8
+    umlsl       v22.8h, v18.8b, v0.8b
+    umlal       v22.8h, v19.8b, v1.8b
+    umlsl       v22.8h, v16.8b, v2.8b
+    umlal       v22.8h, v17.8b, v3.8b
+    umlsl2      v26.8h, v18.16b, v0.16b
+    umlal2      v26.8h, v19.16b, v1.16b
+    umlsl2      v26.8h, v16.16b, v2.16b
+    umlal2      v26.8h, v17.16b, v3.16b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    lsr         x12, x12, 8
+    mov         v18.b[15], w13
+    lsr         x13, x13, 8
+    umlsl       v23.8h, v19.8b, v0.8b
+    umlal       v23.8h, v16.8b, v1.8b
+    umlsl       v23.8h, v17.8b, v2.8b
+    umlal       v23.8h, v18.8b, v3.8b
+    umlsl2      v27.8h, v19.16b, v0.16b
+    umlal2      v27.8h, v16.16b, v1.16b
+    umlsl2      v27.8h, v17.16b, v2.16b
+    umlal2      v27.8h, v18.16b, v3.16b
+    ushr        v19.2d, v19.2d, 8
+    mov         v19.b[7], w12
+    lsr         x12, x12, 8
+    mov         v19.b[15], w13
+    lsr         x13, x13, 8
+    umlal       v20.8h, v16.8b, v4.8b
+    umlsl       v20.8h, v17.8b, v5.8b
+    umlal       v20.8h, v18.8b, v6.8b
+    umlsl       v20.8h, v19.8b, v7.8b
+    umlal2      v24.8h, v16.16b, v4.16b
+    umlsl2      v24.8h, v17.16b, v5.16b
+    umlal2      v24.8h, v18.16b, v6.16b
+    umlsl2      v24.8h, v19.16b, v7.16b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    mov         v16.b[15], w13
+    lsr         x13, x13, 8
+    umlal       v21.8h, v17.8b, v4.8b
+    umlsl       v21.8h, v18.8b, v5.8b
+    umlal       v21.8h, v19.8b, v6.8b
+    umlsl       v21.8h, v16.8b, v7.8b
+    umlal2      v25.8h, v17.16b, v4.16b
+    umlsl2      v25.8h, v18.16b, v5.16b
+    umlal2      v25.8h, v19.16b, v6.16b
+    umlsl2      v25.8h, v16.16b, v7.16b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    mov         v17.b[15], w13
+    lsr         x13, x13, 8
+    umlal       v22.8h, v18.8b, v4.8b
+    umlsl       v22.8h, v19.8b, v5.8b
+    umlal       v22.8h, v16.8b, v6.8b
+    umlsl       v22.8h, v17.8b, v7.8b
+    umlal2      v26.8h, v18.16b, v4.16b
+    umlsl2      v26.8h, v19.16b, v5.16b
+    umlal2      v26.8h, v16.16b, v6.16b
+    umlsl2      v26.8h, v17.16b, v7.16b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    mov         v18.b[15], w13
+    umlal       v23.8h, v19.8b, v4.8b
+    umlsl       v23.8h, v16.8b, v5.8b
+    umlal       v23.8h, v17.8b, v6.8b
+    umlsl       v23.8h, v18.8b, v7.8b
+    umlal2      v27.8h, v19.16b, v4.16b
+    umlsl2      v27.8h, v16.16b, v5.16b
+    umlal2      v27.8h, v17.16b, v6.16b
+    umlsl2      v27.8h, v18.16b, v7.16b
+    sqrshrun    v20.8b, v20.8h, 6
+    sqrshrun    v21.8b, v21.8h, 6
+    sqrshrun    v22.8b, v22.8h, 6
+    sqrshrun    v23.8b, v23.8h, 6
+    sqrshrun2   v20.16b, v24.8h, 6
+    sqrshrun2   v21.16b, v25.8h, 6
+    sqrshrun2   v22.16b, v26.8h, 6
+    sqrshrun2   v23.16b, v27.8h, 6
+    st4         { v20.16b, v21.16b, v22.16b, v23.16b }, [x0], x1
+    add         x2, x2, x3
+    subs        x4, x4, 1
+    b.ne        1b
+    ret
+endfunc
+
+
+
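+// put_hevc_qpel_uni_v*: vertical-only filtering to 8-bit output.  Seven source
+// rows are pre-loaded and the eighth is fetched inside the unrolled loop; rows
+// rotate through v16-v23 so each one is read once.  The 24/32/48/64 variants
+// tail-call the 12/16-column loops, which iterate over the width held in x7.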
+function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
+    load_qpel_filterb x6, x5
+    sub         x2, x2, x3, lsl 1
+    sub         x2, x2, x3
+    ld1         { v16.s }[0], [x2], x3
+    ld1         { v17.s }[0], [x2], x3
+    ld1         { v18.s }[0], [x2], x3
+    ld1         { v19.s }[0], [x2], x3
+    ld1         { v20.s }[0], [x2], x3
+    ld1         { v21.s }[0], [x2], x3
+    ld1         { v22.s }[0], [x2], x3
+1:  ld1         { v23.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v19.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v20.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v21.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v22.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
+    load_qpel_filterb x6, x5
+    sub         x2, x2, x3, lsl 1
+    sub         x2, x2, x3
+    sub         x1, x1, 4
+    ld1         { v16.8b }, [x2], x3
+    ld1         { v17.8b }, [x2], x3
+    ld1         { v18.8b }, [x2], x3
+    ld1         { v19.8b }, [x2], x3
+    ld1         { v20.8b }, [x2], x3
+    ld1         { v21.8b }, [x2], x3
+    ld1         { v22.8b }, [x2], x3
+1:  ld1         { v23.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], 4
+    st1         { v24.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], 4
+    st1         { v24.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], 4
+    st1         { v24.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], 4
+    st1         { v24.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v19.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], 4
+    st1         { v24.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v20.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], 4
+    st1         { v24.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v21.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], 4
+    st1         { v24.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v22.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.s }[0], [x0], 4
+    st1         { v24.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
+    load_qpel_filterb x6, x5
+    sub         x2, x2, x3, lsl 1
+    sub         x2, x2, x3
+    ld1         { v16.8b }, [x2], x3
+    ld1         { v17.8b }, [x2], x3
+    ld1         { v18.8b }, [x2], x3
+    ld1         { v19.8b }, [x2], x3
+    ld1         { v20.8b }, [x2], x3
+    ld1         { v21.8b }, [x2], x3
+    ld1         { v22.8b }, [x2], x3
+1:  ld1         { v23.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v19.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v20.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v21.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v22.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    sqrshrun    v24.8b, v24.8h, 6
+    st1         { v24.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
+    load_qpel_filterb x6, x5
+    sub         x2, x2, x3, lsl 1
+    sub         x2, x2, x3
+    sub         x1, x1, 8
+1:  mov         x11, x4     // height
+    mov         x10, x0     // dst
+    mov         x8, x2      // src
+
+    ld1         { v16.16b }, [x8], x3
+    ld1         { v17.16b }, [x8], x3
+    ld1         { v18.16b }, [x8], x3
+    ld1         { v19.16b }, [x8], x3
+    ld1         { v20.16b }, [x8], x3
+    ld1         { v21.16b }, [x8], x3
+    ld1         { v22.16b }, [x8], x3
+2:  ld1         { v23.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb2 v25, v16, v17, v18, v19, v20, v21, v22, v23
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.8b }, [x10], 8
+    st1         { v24.s }[2], [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v16.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    calc_qpelb2 v25, v17, v18, v19, v20, v21, v22, v23, v16
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.8b }, [x10], 8
+    st1         { v24.s }[2], [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v17.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    calc_qpelb2 v25, v18, v19, v20, v21, v22, v23, v16, v17
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.8b }, [x10], 8
+    st1         { v24.s }[2], [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v18.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    calc_qpelb2 v25, v19, v20, v21, v22, v23, v16, v17, v18
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.8b }, [x10], 8
+    st1         { v24.s }[2], [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v19.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    calc_qpelb2 v25, v20, v21, v22, v23, v16, v17, v18, v19
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.8b }, [x10], 8
+    st1         { v24.s }[2], [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v20.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    calc_qpelb2 v25, v21, v22, v23, v16, v17, v18, v19, v20
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.8b }, [x10], 8
+    st1         { v24.s }[2], [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v21.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    calc_qpelb2 v25, v22, v23, v16, v17, v18, v19, v20, v21
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.8b }, [x10], 8
+    st1         { v24.s }[2], [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v22.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    calc_qpelb2 v25, v23, v16, v17, v18, v19, v20, v21, v22
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.8b }, [x10], 8
+    st1         { v24.s }[2], [x10], x1
+    subs        x11, x11, 1
+    b.hi        2b
+
+3:  add         x0, x0, 12      // advance dst to the next 12-column strip
+    add         x2, x2, 12      // advance src likewise
+    subs        x7, x7, 12      // width is passed in x7
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
+    load_qpel_filterb x6, x5
+    sub         x2, x2, x3, lsl 1
+    sub         x2, x2, x3
+1:  mov         x11, x4     // height
+    mov         x10, x0     // dst
+    mov         x8, x2      // src
+
+    ld1         { v16.16b }, [x8], x3
+    ld1         { v17.16b }, [x8], x3
+    ld1         { v18.16b }, [x8], x3
+    ld1         { v19.16b }, [x8], x3
+    ld1         { v20.16b }, [x8], x3
+    ld1         { v21.16b }, [x8], x3
+    ld1         { v22.16b }, [x8], x3
+2:  ld1         { v23.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb2 v25, v16, v17, v18, v19, v20, v21, v22, v23
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v16.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    calc_qpelb2 v25, v17, v18, v19, v20, v21, v22, v23, v16
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v17.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    calc_qpelb2 v25, v18, v19, v20, v21, v22, v23, v16, v17
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v18.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    calc_qpelb2 v25, v19, v20, v21, v22, v23, v16, v17, v18
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v19.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    calc_qpelb2 v25, v20, v21, v22, v23, v16, v17, v18, v19
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v20.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    calc_qpelb2 v25, v21, v22, v23, v16, v17, v18, v19, v20
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v21.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    calc_qpelb2 v25, v22, v23, v16, v17, v18, v19, v20, v21
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v22.16b }, [x8], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    calc_qpelb2 v25, v23, v16, v17, v18, v19, v20, v21, v22
+    sqrshrun    v24.8b, v24.8h, 6
+    sqrshrun2   v24.16b, v25.8h, 6
+    st1         { v24.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.hi        2b
+
+3:  add         x0, x0, 16
+    add         x2, x2, 16
+    subs        x7, x7, 16
+    b.ne        1b
+    ret
+endfunc
+
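+// The wider unipred vertical blocks reuse the column loops above: 24 goes
+// through the 12-wide path twice, 32/48/64 through the 16-wide path, each
+// iterating until x7 (width) reaches zero.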
+function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
+    b ff_hevc_put_hevc_qpel_uni_v12_8_neon
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
+    b ff_hevc_put_hevc_qpel_uni_v16_8_neon
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
+    b ff_hevc_put_hevc_qpel_uni_v16_8_neon
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
+    b ff_hevc_put_hevc_qpel_uni_v16_8_neon
+endfunc
+
+
+
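+// The unipred hv variants work in two passes: a (height + 7)-row temporary
+// array of 16-bit samples (128 bytes per row) is reserved on the stack, the
+// matching ff_hevc_put_hevc_qpel_hN_8_neon horizontal filter is called into
+// it starting three rows above the source, and the vertical 8-tap pass then
+// reads it back, applying a rounding shift by 12 and sqxtun to 8-bit pixels.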
+function ff_hevc_put_hevc_qpel_uni_hv4_8_neon, export=1
+    add         x10, x4, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         x30, xzr, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x4, 7
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_qpel_h4_8_neon
+    ldp         x30, xzr, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_qpel_filterh x6, x5
+    mov         x9, 128
+    ld1         { v16.4h }, [sp], x9
+    ld1         { v17.4h }, [sp], x9
+    ld1         { v18.4h }, [sp], x9
+    ld1         { v19.4h }, [sp], x9
+    ld1         { v20.4h }, [sp], x9
+    ld1         { v21.4h }, [sp], x9
+    ld1         { v22.4h }, [sp], x9
+1:  ld1         { v23.4h }, [sp], x9
+    calc_qpelh  v1, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x9
+    calc_qpelh  v1, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x9
+    calc_qpelh  v1, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x9
+    calc_qpelh  v1, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v19.8h }, [sp], x9
+    calc_qpelh  v1, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v20.8h }, [sp], x9
+    calc_qpelh  v1, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v21.8h }, [sp], x9
+    calc_qpelh  v1, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v22.8h }, [sp], x9
+    calc_qpelh  v1, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv6_8_neon, export=1
+    add         x10, x4, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         x30, xzr, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x4, 7
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_qpel_h6_8_neon
+    ldp         x30, xzr, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_qpel_filterh x6, x5
+    sub         x1, x1, 4
+    mov         x9, 128
+    ld1         { v16.8h }, [sp], x9
+    ld1         { v17.8h }, [sp], x9
+    ld1         { v18.8h }, [sp], x9
+    ld1         { v19.8h }, [sp], x9
+    ld1         { v20.8h }, [sp], x9
+    ld1         { v21.8h }, [sp], x9
+    ld1         { v22.8h }, [sp], x9
+1:  ld1         { v23.8h }, [sp], x9
+    calc_qpelh  v1, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn, 12
+    calc_qpelh2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x9
+    calc_qpelh  v1, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn, 12
+    calc_qpelh2 v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x9
+    calc_qpelh  v1, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn, 12
+    calc_qpelh2 v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x9
+    calc_qpelh  v1, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn, 12
+    calc_qpelh2 v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v19.8h }, [sp], x9
+    calc_qpelh  v1, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn, 12
+    calc_qpelh2 v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v20.8h }, [sp], x9
+    calc_qpelh  v1, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn, 12
+    calc_qpelh2 v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v21.8h }, [sp], x9
+    calc_qpelh  v1, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn, 12
+    calc_qpelh2 v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v22.8h }, [sp], x9
+    calc_qpelh  v1, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn, 12
+    calc_qpelh2 v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x4, x4, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv8_8_neon, export=1
+    add         x10, x4, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         x30, xzr, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x4, 7
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_qpel_h8_8_neon
+    ldp         x30, xzr, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_qpel_filterh x6, x5
+    mov         x9, 128
+    ld1         { v16.8h }, [sp], x9
+    ld1         { v17.8h }, [sp], x9
+    ld1         { v18.8h }, [sp], x9
+    ld1         { v19.8h }, [sp], x9
+    ld1         { v20.8h }, [sp], x9
+    ld1         { v21.8h }, [sp], x9
+    ld1         { v22.8h }, [sp], x9
+1:  ld1         { v23.8h }, [sp], x9
+    calc_qpelh  v1, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn, 12
+    calc_qpelh2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x9
+    calc_qpelh  v1, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn, 12
+    calc_qpelh2 v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x9
+    calc_qpelh  v1, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn, 12
+    calc_qpelh2 v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x9
+    calc_qpelh  v1, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn, 12
+    calc_qpelh2 v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v19.8h }, [sp], x9
+    calc_qpelh  v1, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn, 12
+    calc_qpelh2 v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v20.8h }, [sp], x9
+    calc_qpelh  v1, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn, 12
+    calc_qpelh2 v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v21.8h }, [sp], x9
+    calc_qpelh  v1, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn, 12
+    calc_qpelh2 v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v22.8h }, [sp], x9
+    calc_qpelh  v1, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn, 12
+    calc_qpelh2 v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x4, x4, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv12_8_neon, export=1
+    add         x10, x4, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x4, 7
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_qpel_h12_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_qpel_filterh x6, x5
+    sub         x1, x1, 8
+    mov         x9, 128
+
+    ld1         { v16.8h, v17.8h }, [sp], x9
+    ld1         { v18.8h, v19.8h }, [sp], x9
+    ld1         { v20.8h, v21.8h }, [sp], x9
+    ld1         { v22.8h, v23.8h }, [sp], x9
+    ld1         { v24.8h, v25.8h }, [sp], x9
+    ld1         { v26.8h, v27.8h }, [sp], x9
+    ld1         { v28.8h, v29.8h }, [sp], x9
+1:  ld1         { v30.8h, v31.8h }, [sp], x9
+    calc_qpelh  v1, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn, 12
+    calc_qpelh2 v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn2, 12
+    calc_qpelh  v2, v17, v19, v21, v23, v25, v27, v29, v31, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.8b }, [x0], 8
+    st1         { v1.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v16.8h, v17.8h }, [sp], x9
+    calc_qpelh  v1, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn, 12
+    calc_qpelh2 v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn2, 12
+    calc_qpelh  v2, v19, v21, v23, v25, v27, v29, v31, v17, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.8b }, [x0], 8
+    st1         { v1.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v18.8h, v19.8h }, [sp], x9
+    calc_qpelh  v1, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn, 12
+    calc_qpelh2 v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn2, 12
+    calc_qpelh  v2, v21, v23, v25, v27, v29, v31, v17, v19, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.8b }, [x0], 8
+    st1         { v1.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v20.8h, v21.8h }, [sp], x9
+    calc_qpelh  v1, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn, 12
+    calc_qpelh2 v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn2, 12
+    calc_qpelh  v2, v23, v25, v27, v29, v31, v17, v19, v21, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.8b }, [x0], 8
+    st1         { v1.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v22.8h, v23.8h }, [sp], x9
+    calc_qpelh  v1, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn, 12
+    calc_qpelh2 v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn2, 12
+    calc_qpelh  v2, v25, v27, v29, v31, v17, v19, v21, v23, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.8b }, [x0], 8
+    st1         { v1.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v24.8h, v25.8h }, [sp], x9
+    calc_qpelh  v1, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn, 12
+    calc_qpelh2 v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn2, 12
+    calc_qpelh  v2, v27, v29, v31, v17, v19, v21, v23, v25, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.8b }, [x0], 8
+    st1         { v1.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v26.8h, v27.8h }, [sp], x9
+    calc_qpelh  v1, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn, 12
+    calc_qpelh2 v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn2, 12
+    calc_qpelh  v2, v29, v31, v17, v19, v21, v23, v25, v27, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.8b }, [x0], 8
+    st1         { v1.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.eq        2f
+
+    ld1         { v28.8h, v29.8h }, [sp], x9
+    calc_qpelh  v1, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn, 12
+    calc_qpelh2 v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn2, 12
+    calc_qpelh  v2, v31, v17, v19, v21, v23, v25, v27, v29, sqrshrn, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.8b }, [x0], 8
+    st1         { v1.s }[2], [x0], x1
+    subs        x4, x4, 1
+    b.ne        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv16_8_neon, export=1
+    add         x10, x4, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x4, 7
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_qpel_h16_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+.Lqpel_uni_hv16_loop:
+    load_qpel_filterh x6, x5
+    mov         x9, 128
+    sub         x12, x9, x7, lsl 1
+1:  mov         x11, x4     // height
+    mov         x10, x0     // dst
+    mov         x8, sp      // src
+
+    ld1         { v16.8h, v17.8h }, [x8], x9
+    ld1         { v18.8h, v19.8h }, [x8], x9
+    ld1         { v20.8h, v21.8h }, [x8], x9
+    ld1         { v22.8h, v23.8h }, [x8], x9
+    ld1         { v24.8h, v25.8h }, [x8], x9
+    ld1         { v26.8h, v27.8h }, [x8], x9
+    ld1         { v28.8h, v29.8h }, [x8], x9
+2:  ld1         { v30.8h, v31.8h }, [x8], x9
+    calc_qpelh  v1, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn, 12
+    calc_qpelh2 v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn2, 12
+    calc_qpelh  v2, v17, v19, v21, v23, v25, v27, v29, v31, sqrshrn, 12
+    calc_qpelh2 v2, v3, v17, v19, v21, v23, v25, v27, v29, v31, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v16.8h, v17.8h }, [x8], x9
+    calc_qpelh  v1, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn, 12
+    calc_qpelh2 v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn2, 12
+    calc_qpelh  v2, v19, v21, v23, v25, v27, v29, v31, v17, sqrshrn, 12
+    calc_qpelh2 v2, v3, v19, v21, v23, v25, v27, v29, v31, v17, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v18.8h, v19.8h }, [x8], x9
+    calc_qpelh  v1, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn, 12
+    calc_qpelh2 v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn2, 12
+    calc_qpelh  v2, v21, v23, v25, v27, v29, v31, v17, v19, sqrshrn, 12
+    calc_qpelh2 v2, v3, v21, v23, v25, v27, v29, v31, v17, v19, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v20.8h, v21.8h }, [x8], x9
+    calc_qpelh  v1, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn, 12
+    calc_qpelh2 v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn2, 12
+    calc_qpelh  v2, v23, v25, v27, v29, v31, v17, v19, v21, sqrshrn, 12
+    calc_qpelh2 v2, v3, v23, v25, v27, v29, v31, v17, v19, v21, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v22.8h, v23.8h }, [x8], x9
+    calc_qpelh  v1, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn, 12
+    calc_qpelh2 v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn2, 12
+    calc_qpelh  v2, v25, v27, v29, v31, v17, v19, v21, v23, sqrshrn, 12
+    calc_qpelh2 v2, v3, v25, v27, v29, v31, v17, v19, v21, v23, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v24.8h, v25.8h }, [x8], x9
+    calc_qpelh  v1, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn, 12
+    calc_qpelh2 v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn2, 12
+    calc_qpelh  v2, v27, v29, v31, v17, v19, v21, v23, v25, sqrshrn, 12
+    calc_qpelh2 v2, v3, v27, v29, v31, v17, v19, v21, v23, v25, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v26.8h, v27.8h }, [x8], x9
+    calc_qpelh  v1, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn, 12
+    calc_qpelh2 v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn2, 12
+    calc_qpelh  v2, v29, v31, v17, v19, v21, v23, v25, v27, sqrshrn, 12
+    calc_qpelh2 v2, v3, v29, v31, v17, v19, v21, v23, v25, v27, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v28.8h, v29.8h }, [x8], x9
+    calc_qpelh  v1, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn, 12
+    calc_qpelh2 v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn2, 12
+    calc_qpelh  v2, v31, v17, v19, v21, v23, v25, v27, v29, sqrshrn, 12
+    calc_qpelh2 v2, v3, v31, v17, v19, v21, v23, v25, v27, v29, sqrshrn2, 12
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.hi        2b
+
+3:  add         x0, x0, 16
+    add         sp, sp, 32
+    subs        x7, x7, 16
+    b.ne        1b
+    add         sp, sp, x12     // discard rest of first line
+    add         x10, x4, 6
+    lsl         x10, x10, 7
+    add         sp, sp, x10     // tmp_array without first line
+    ret
+endfunc
+
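+// 24-wide unipred hv is composed of a 16-wide call followed by an 8-wide
+// call on the remaining columns, with the clobbered arguments reloaded from
+// the stack in between.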
+function ff_hevc_put_hevc_qpel_uni_hv24_8_neon, export=1
+    stp         x6, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    mov         x7, 16
+    bl          ff_hevc_put_hevc_qpel_uni_hv16_8_neon
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    ldr         x6, [sp]
+    add         x0, x0, 16
+    add         x2, x2, 16
+    mov         x7, 8
+    bl          ff_hevc_put_hevc_qpel_uni_hv8_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
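+// For 32/48/64-wide unipred hv only the horizontal pass differs; afterwards
+// the code joins the shared .Lqpel_uni_hv16_loop, which walks the temporary
+// array 16 columns at a time and releases it before returning.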
+function ff_hevc_put_hevc_qpel_uni_hv32_8_neon, export=1
+    add         x10, x4, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x4, 7
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_qpel_h32_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    b .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv48_8_neon, export=1
+    add         x10, x4, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x4, 7
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_qpel_h48_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    b .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv64_8_neon, export=1
+    add         x10, x4, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x6, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x4, 7
+    mov         x4, x5
+    bl          ff_hevc_put_hevc_qpel_h64_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x6, [sp], 16
+    ldp         x0, x1, [sp], 16
+    b .Lqpel_uni_hv16_loop
+endfunc
+
+
+
+
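+// Horizontal 8-tap bipred filters.  Source bytes are accumulated into 16-bit
+// sums with umlsl/umlal; between taps the source vector is shifted down one
+// byte (ushr on the .2d view) and the next input byte is inserted into lane
+// 7.  The 16-bit src2 row (read from x4 with a 128-byte stride) is added
+// with sqadd, and the result is narrowed with a rounding shift by 7 and
+// unsigned saturation.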
+function ff_hevc_put_hevc_qpel_bi_h4_8_neon, export=1
+    load_qpel_filterb x6, x7
+    sub         x2, x2, 3
+    mov         x10, 128
+1:  ld1         { v16.8b, v17.8b }, [x2], x3
+    movi        v20.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[0]
+    umlal       v20.8h, v16.8b, v1.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[1]
+    umlsl       v20.8h, v16.8b, v2.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[2]
+    umlal       v20.8h, v16.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[3]
+    umlal       v20.8h, v16.8b, v4.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[4]
+    umlsl       v20.8h, v16.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[5]
+    umlal       v20.8h, v16.8b, v6.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[6]
+    umlsl       v20.8h, v16.8b, v7.8b
+    ld1         { v24.8h }, [x4], x10
+    sqadd       v16.8h, v20.8h, v24.8h
+    sqrshrun    v16.8b, v16.8h, 7
+    st1         { v16.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h6_8_neon, export=1
+    load_qpel_filterb x6, x7
+    sub         x2, x2, 3
+    sub         x1, x1, 4
+    mov         x10, 128
+1:  ld1         { v16.8b, v17.8b }, [x2], x3
+    movi        v20.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[0]
+    umlal       v20.8h, v16.8b, v1.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[1]
+    umlsl       v20.8h, v16.8b, v2.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[2]
+    umlal       v20.8h, v16.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[3]
+    umlal       v20.8h, v16.8b, v4.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[4]
+    umlsl       v20.8h, v16.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[5]
+    umlal       v20.8h, v16.8b, v6.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[6]
+    umlsl       v20.8h, v16.8b, v7.8b
+    ld1         { v24.8h }, [x4], x10
+    sqadd       v16.8h, v20.8h, v24.8h
+    sqrshrun    v16.8b, v16.8h, 7
+    st1         { v16.s }[0], [x0], 4
+    st1         { v16.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h8_8_neon, export=1
+    load_qpel_filterb x6, x7
+    sub         x2, x2, 3
+    mov         x10, 128
+1:  ld1         { v16.8b, v17.8b }, [x2], x3
+    movi        v20.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[0]
+    umlal       v20.8h, v16.8b, v1.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[1]
+    umlsl       v20.8h, v16.8b, v2.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[2]
+    umlal       v20.8h, v16.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[3]
+    umlal       v20.8h, v16.8b, v4.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[4]
+    umlsl       v20.8h, v16.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[5]
+    umlal       v20.8h, v16.8b, v6.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], v17.b[6]
+    umlsl       v20.8h, v16.8b, v7.8b
+    ld1         { v24.8h }, [x4], x10
+    sqadd       v16.8h, v20.8h, v24.8h
+    sqrshrun    v16.8b, v16.8h, 7
+    st1         { v16.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
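+// The 12- to 32-wide bipred variants deinterleave the source with ld2/ld3/ld4
+// so that each output phase keeps its own accumulator; bytes beyond the
+// vector loads are fed in from x12 via lane inserts.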
+function ff_hevc_put_hevc_qpel_bi_h12_8_neon, export=1
+    load_qpel_filterb x6, x7
+    sub         x2, x2, 3
+    sub         x1, x1, 8
+    mov         x10, 128
+1:  ld2         { v16.8b, v17.8b }, [x2]
+    ldr         w12, [x2, 16]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v16.8b, v1.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v20.8h, v16.8b, v2.8b
+    umlal       v20.8h, v17.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    umlsl       v21.8h, v17.8b, v2.8b
+    umlal       v21.8h, v16.8b, v3.8b
+    ushr        v17.2d, v17.2d, 8
+    umlal       v20.8h, v16.8b, v4.8b
+    umlsl       v20.8h, v17.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    umlal       v21.8h, v17.8b, v4.8b
+    umlsl       v21.8h, v16.8b, v5.8b
+    ushr        v17.2d, v17.2d, 8
+    umlal       v20.8h, v16.8b, v6.8b
+    umlsl       v20.8h, v17.8b, v7.8b
+    ushr        v16.2d, v16.2d, 8
+    umlal       v21.8h, v17.8b, v6.8b
+    umlsl       v21.8h, v16.8b, v7.8b
+    ld2         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v16.8h, v20.8h, v24.8h
+    sqadd       v17.8h, v21.8h, v25.8h
+    sqrshrun    v16.8b, v16.8h, 7
+    sqrshrun    v17.8b, v17.8h, 7
+    zip1        v16.16b, v16.16b, v17.16b
+    st1         { v16.8b }, [x0], 8
+    st1         { v16.s }[2], [x0], x1
+    add         x2, x2, x3
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h16_8_neon, export=1
+    load_qpel_filterb x6, x7
+    sub         x2, x2, 3
+    mov         x10, 128
+1:  ld2         { v16.8b, v17.8b }, [x2]
+    ldr         x12, [x2, 16]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v16.8b, v1.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v20.8h, v16.8b, v2.8b
+    umlal       v20.8h, v17.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v21.8h, v17.8b, v2.8b
+    umlal       v21.8h, v16.8b, v3.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v20.8h, v16.8b, v4.8b
+    umlsl       v20.8h, v17.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v21.8h, v17.8b, v4.8b
+    umlsl       v21.8h, v16.8b, v5.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v20.8h, v16.8b, v6.8b
+    umlsl       v20.8h, v17.8b, v7.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    umlal       v21.8h, v17.8b, v6.8b
+    umlsl       v21.8h, v16.8b, v7.8b
+    ld2         { v24.8h, v25.8h }, [x4], x10
+    sqadd       v16.8h, v20.8h, v24.8h
+    sqadd       v17.8h, v21.8h, v25.8h
+    sqrshrun    v16.8b, v16.8h, 7
+    sqrshrun    v17.8b, v17.8h, 7
+    st2         { v16.8b, v17.8b }, [x0], x1
+    add         x2, x2, x3
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h24_8_neon, export=1
+    load_qpel_filterb x6, x7
+    sub         x2, x2, 3
+    mov         x10, 128
+1:  ld3         { v16.8b, v17.8b, v18.8b }, [x2]
+    ldr         x12, [x2, 24]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    umlsl       v20.8h, v18.8b, v2.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v18.8b, v1.8b
+    umlsl       v21.8h, v16.8b, v2.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v22.8h, v18.8b, v0.8b
+    umlal       v22.8h, v16.8b, v1.8b
+    umlsl       v22.8h, v17.8b, v2.8b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v20.8h, v16.8b, v3.8b
+    umlal       v20.8h, v17.8b, v4.8b
+    umlsl       v20.8h, v18.8b, v5.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v21.8h, v17.8b, v3.8b
+    umlal       v21.8h, v18.8b, v4.8b
+    umlsl       v21.8h, v16.8b, v5.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v22.8h, v18.8b, v3.8b
+    umlal       v22.8h, v16.8b, v4.8b
+    umlsl       v22.8h, v17.8b, v5.8b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v20.8h, v16.8b, v6.8b
+    umlsl       v20.8h, v17.8b, v7.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    umlal       v21.8h, v17.8b, v6.8b
+    umlsl       v21.8h, v18.8b, v7.8b
+    umlal       v22.8h, v18.8b, v6.8b
+    umlsl       v22.8h, v16.8b, v7.8b
+    ld3         { v23.8h, v24.8h, v25.8h }, [x4], x10
+    sqadd       v16.8h, v20.8h, v23.8h
+    sqadd       v17.8h, v21.8h, v24.8h
+    sqadd       v18.8h, v22.8h, v25.8h
+    sqrshrun    v16.8b, v16.8h, 7
+    sqrshrun    v17.8b, v17.8h, 7
+    sqrshrun    v18.8b, v18.8h, 7
+    st3         { v16.8b, v17.8b, v18.8b }, [x0], x1
+    add         x2, x2, x3
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h32_8_neon, export=1
+    load_qpel_filterb x6, x7
+    sub         x2, x2, 3
+    mov         x10, 128
+1:  ld4         { v16.8b, v17.8b, v18.8b, v19.8b }, [x2]
+    ldr         x12, [x2, 32]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    movi        v23.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    umlsl       v20.8h, v18.8b, v2.8b
+    umlal       v20.8h, v19.8b, v3.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v18.8b, v1.8b
+    umlsl       v21.8h, v19.8b, v2.8b
+    umlal       v21.8h, v16.8b, v3.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v22.8h, v18.8b, v0.8b
+    umlal       v22.8h, v19.8b, v1.8b
+    umlsl       v22.8h, v16.8b, v2.8b
+    umlal       v22.8h, v17.8b, v3.8b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    lsr         x12, x12, 8
+    umlsl       v23.8h, v19.8b, v0.8b
+    umlal       v23.8h, v16.8b, v1.8b
+    umlsl       v23.8h, v17.8b, v2.8b
+    umlal       v23.8h, v18.8b, v3.8b
+    ushr        v19.2d, v19.2d, 8
+    mov         v19.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v20.8h, v16.8b, v4.8b
+    umlsl       v20.8h, v17.8b, v5.8b
+    umlal       v20.8h, v18.8b, v6.8b
+    umlsl       v20.8h, v19.8b, v7.8b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v21.8h, v17.8b, v4.8b
+    umlsl       v21.8h, v18.8b, v5.8b
+    umlal       v21.8h, v19.8b, v6.8b
+    umlsl       v21.8h, v16.8b, v7.8b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    umlal       v22.8h, v18.8b, v4.8b
+    umlsl       v22.8h, v19.8b, v5.8b
+    umlal       v22.8h, v16.8b, v6.8b
+    umlsl       v22.8h, v17.8b, v7.8b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    umlal       v23.8h, v19.8b, v4.8b
+    umlsl       v23.8h, v16.8b, v5.8b
+    umlal       v23.8h, v17.8b, v6.8b
+    umlsl       v23.8h, v18.8b, v7.8b
+    ld4         { v24.8h, v25.8h, v26.8h, v27.8h }, [x4], x10
+    sqadd       v16.8h, v20.8h, v24.8h
+    sqadd       v17.8h, v21.8h, v25.8h
+    sqadd       v18.8h, v22.8h, v26.8h
+    sqadd       v19.8h, v23.8h, v27.8h
+    sqrshrun    v16.8b, v16.8h, 7
+    sqrshrun    v17.8b, v17.8h, 7
+    sqrshrun    v18.8b, v18.8h, 7
+    sqrshrun    v19.8b, v19.8h, 7
+    st4         { v16.8b, v17.8b, v18.8b, v19.8b }, [x0], x1
+    add         x2, x2, x3
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
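+// 48-wide bipred: ld3 of 16-byte registers, with the low and high halves
+// filtered by umlsl/umlal and umlsl2/umlal2 respectively.  The src2 row is
+// fetched as two 48-byte ld3 loads; the second post-increments x4 by 80 so
+// that it advances a full 128-byte row.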
+function ff_hevc_put_hevc_qpel_bi_h48_8_neon, export=1
+    load_qpel_filterb x6, x7
+    sub         x2, x2, 3
+    mov         x10, 80
+1:  ld3         { v16.16b, v17.16b, v18.16b }, [x2]
+    ldr         x12, [x2, 24]
+    ldr         x13, [x2, 48]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    movi        v23.8h, 0
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    umlsl       v20.8h, v18.8b, v2.8b
+    umlsl2      v23.8h, v16.16b, v0.16b
+    umlal2      v23.8h, v17.16b, v1.16b
+    umlsl2      v23.8h, v18.16b, v2.16b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    mov         v16.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v18.8b, v1.8b
+    umlsl       v21.8h, v16.8b, v2.8b
+    umlsl2      v24.8h, v17.16b, v0.16b
+    umlal2      v24.8h, v18.16b, v1.16b
+    umlsl2      v24.8h, v16.16b, v2.16b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    mov         v17.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlsl       v22.8h, v18.8b, v0.8b
+    umlal       v22.8h, v16.8b, v1.8b
+    umlsl       v22.8h, v17.8b, v2.8b
+    umlsl2      v25.8h, v18.16b, v0.16b
+    umlal2      v25.8h, v16.16b, v1.16b
+    umlsl2      v25.8h, v17.16b, v2.16b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    mov         v18.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlal       v20.8h, v16.8b, v3.8b
+    umlal       v20.8h, v17.8b, v4.8b
+    umlsl       v20.8h, v18.8b, v5.8b
+    umlal2      v23.8h, v16.16b, v3.16b
+    umlal2      v23.8h, v17.16b, v4.16b
+    umlsl2      v23.8h, v18.16b, v5.16b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    mov         v16.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlal       v21.8h, v17.8b, v3.8b
+    umlal       v21.8h, v18.8b, v4.8b
+    umlsl       v21.8h, v16.8b, v5.8b
+    umlal2      v24.8h, v17.16b, v3.16b
+    umlal2      v24.8h, v18.16b, v4.16b
+    umlsl2      v24.8h, v16.16b, v5.16b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    mov         v17.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlal       v22.8h, v18.8b, v3.8b
+    umlal       v22.8h, v16.8b, v4.8b
+    umlsl       v22.8h, v17.8b, v5.8b
+    umlal2      v25.8h, v18.16b, v3.16b
+    umlal2      v25.8h, v16.16b, v4.16b
+    umlsl2      v25.8h, v17.16b, v5.16b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    mov         v18.b[15], w13
+    lsr         x12, x12, 8
+    lsr         x13, x13, 8
+    umlal       v20.8h, v16.8b, v6.8b
+    umlsl       v20.8h, v17.8b, v7.8b
+    umlal2      v23.8h, v16.16b, v6.16b
+    umlsl2      v23.8h, v17.16b, v7.16b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    mov         v16.b[15], w13
+    umlal       v21.8h, v17.8b, v6.8b
+    umlsl       v21.8h, v18.8b, v7.8b
+    umlal2      v24.8h, v17.16b, v6.16b
+    umlsl2      v24.8h, v18.16b, v7.16b
+    umlal       v22.8h, v18.8b, v6.8b
+    umlsl       v22.8h, v16.8b, v7.8b
+    umlal2      v25.8h, v18.16b, v6.16b
+    umlsl2      v25.8h, v16.16b, v7.16b
+    ld3         { v26.8h, v27.8h, v28.8h }, [x4], 48
+    sqadd       v16.8h, v20.8h, v26.8h
+    sqadd       v17.8h, v21.8h, v27.8h
+    sqadd       v18.8h, v22.8h, v28.8h
+    ld3         { v26.8h, v27.8h, v28.8h }, [x4], x10
+    sqadd       v19.8h, v23.8h, v26.8h
+    sqadd       v20.8h, v24.8h, v27.8h
+    sqadd       v21.8h, v25.8h, v28.8h
+    sqrshrun    v16.8b, v16.8h, 7
+    sqrshrun    v17.8b, v17.8h, 7
+    sqrshrun    v18.8b, v18.8h, 7
+    sqrshrun2   v16.16b, v19.8h, 7
+    sqrshrun2   v17.16b, v20.8h, 7
+    sqrshrun2   v18.16b, v21.8h, 7
+    st3         { v16.16b, v17.16b, v18.16b }, [x0], x1
+    add         x2, x2, x3
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
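+// 64-wide bipred follows the 48-wide pattern with ld4 of 16-byte registers;
+// the src2 row is read as two 64-byte ld4 loads.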
+function ff_hevc_put_hevc_qpel_bi_h64_8_neon, export=1
+    load_qpel_filterb x6, x7
+    sub         x2, x2, 3
+1:  ld4         { v16.16b, v17.16b, v18.16b, v19.16b }, [x2]
+    ldr         x12, [x2, 32]
+    ldr         x13, [x2, 64]
+    movi        v20.8h, 0
+    movi        v21.8h, 0
+    movi        v22.8h, 0
+    movi        v23.8h, 0
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    movi        v26.8h, 0
+    movi        v27.8h, 0
+    umlsl       v20.8h, v16.8b, v0.8b
+    umlal       v20.8h, v17.8b, v1.8b
+    umlsl       v20.8h, v18.8b, v2.8b
+    umlal       v20.8h, v19.8b, v3.8b
+    umlsl2      v24.8h, v16.16b, v0.16b
+    umlal2      v24.8h, v17.16b, v1.16b
+    umlsl2      v24.8h, v18.16b, v2.16b
+    umlal2      v24.8h, v19.16b, v3.16b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    mov         v16.b[15], w13
+    lsr         x13, x13, 8
+    umlsl       v21.8h, v17.8b, v0.8b
+    umlal       v21.8h, v18.8b, v1.8b
+    umlsl       v21.8h, v19.8b, v2.8b
+    umlal       v21.8h, v16.8b, v3.8b
+    umlsl2      v25.8h, v17.16b, v0.16b
+    umlal2      v25.8h, v18.16b, v1.16b
+    umlsl2      v25.8h, v19.16b, v2.16b
+    umlal2      v25.8h, v16.16b, v3.16b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    mov         v17.b[15], w13
+    lsr         x13, x13, 8
+    umlsl       v22.8h, v18.8b, v0.8b
+    umlal       v22.8h, v19.8b, v1.8b
+    umlsl       v22.8h, v16.8b, v2.8b
+    umlal       v22.8h, v17.8b, v3.8b
+    umlsl2      v26.8h, v18.16b, v0.16b
+    umlal2      v26.8h, v19.16b, v1.16b
+    umlsl2      v26.8h, v16.16b, v2.16b
+    umlal2      v26.8h, v17.16b, v3.16b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    lsr         x12, x12, 8
+    mov         v18.b[15], w13
+    lsr         x13, x13, 8
+    umlsl       v23.8h, v19.8b, v0.8b
+    umlal       v23.8h, v16.8b, v1.8b
+    umlsl       v23.8h, v17.8b, v2.8b
+    umlal       v23.8h, v18.8b, v3.8b
+    umlsl2      v27.8h, v19.16b, v0.16b
+    umlal2      v27.8h, v16.16b, v1.16b
+    umlsl2      v27.8h, v17.16b, v2.16b
+    umlal2      v27.8h, v18.16b, v3.16b
+    ushr        v19.2d, v19.2d, 8
+    mov         v19.b[7], w12
+    lsr         x12, x12, 8
+    mov         v19.b[15], w13
+    lsr         x13, x13, 8
+    umlal       v20.8h, v16.8b, v4.8b
+    umlsl       v20.8h, v17.8b, v5.8b
+    umlal       v20.8h, v18.8b, v6.8b
+    umlsl       v20.8h, v19.8b, v7.8b
+    umlal2      v24.8h, v16.16b, v4.16b
+    umlsl2      v24.8h, v17.16b, v5.16b
+    umlal2      v24.8h, v18.16b, v6.16b
+    umlsl2      v24.8h, v19.16b, v7.16b
+    ushr        v16.2d, v16.2d, 8
+    mov         v16.b[7], w12
+    lsr         x12, x12, 8
+    mov         v16.b[15], w13
+    lsr         x13, x13, 8
+    umlal       v21.8h, v17.8b, v4.8b
+    umlsl       v21.8h, v18.8b, v5.8b
+    umlal       v21.8h, v19.8b, v6.8b
+    umlsl       v21.8h, v16.8b, v7.8b
+    umlal2      v25.8h, v17.16b, v4.16b
+    umlsl2      v25.8h, v18.16b, v5.16b
+    umlal2      v25.8h, v19.16b, v6.16b
+    umlsl2      v25.8h, v16.16b, v7.16b
+    ushr        v17.2d, v17.2d, 8
+    mov         v17.b[7], w12
+    lsr         x12, x12, 8
+    mov         v17.b[15], w13
+    lsr         x13, x13, 8
+    umlal       v22.8h, v18.8b, v4.8b
+    umlsl       v22.8h, v19.8b, v5.8b
+    umlal       v22.8h, v16.8b, v6.8b
+    umlsl       v22.8h, v17.8b, v7.8b
+    umlal2      v26.8h, v18.16b, v4.16b
+    umlsl2      v26.8h, v19.16b, v5.16b
+    umlal2      v26.8h, v16.16b, v6.16b
+    umlsl2      v26.8h, v17.16b, v7.16b
+    ushr        v18.2d, v18.2d, 8
+    mov         v18.b[7], w12
+    mov         v18.b[15], w13
+    umlal       v23.8h, v19.8b, v4.8b
+    umlsl       v23.8h, v16.8b, v5.8b
+    umlal       v23.8h, v17.8b, v6.8b
+    umlsl       v23.8h, v18.8b, v7.8b
+    umlal2      v27.8h, v19.16b, v4.16b
+    umlsl2      v27.8h, v16.16b, v5.16b
+    umlal2      v27.8h, v17.16b, v6.16b
+    umlsl2      v27.8h, v18.16b, v7.16b
+    ld4         { v28.8h, v29.8h, v30.8h, v31.8h }, [x4], 64
+    sqadd       v20.8h, v20.8h, v28.8h
+    sqadd       v21.8h, v21.8h, v29.8h
+    sqadd       v22.8h, v22.8h, v30.8h
+    sqadd       v23.8h, v23.8h, v31.8h
+    ld4         { v28.8h, v29.8h, v30.8h, v31.8h }, [x4], 64
+    sqadd       v24.8h, v24.8h, v28.8h
+    sqadd       v25.8h, v25.8h, v29.8h
+    sqadd       v26.8h, v26.8h, v30.8h
+    sqadd       v27.8h, v27.8h, v31.8h
+    sqrshrun    v16.8b, v20.8h, 7
+    sqrshrun    v17.8b, v21.8h, 7
+    sqrshrun    v18.8b, v22.8h, 7
+    sqrshrun    v19.8b, v23.8h, 7
+    sqrshrun2   v16.16b, v24.8h, 7
+    sqrshrun2   v17.16b, v25.8h, 7
+    sqrshrun2   v18.16b, v26.8h, 7
+    sqrshrun2   v19.16b, v27.8h, 7
+    st4         { v16.16b, v17.16b, v18.16b, v19.16b }, [x0], x1
+    add         x2, x2, x3
+    subs        x5, x5, 1
+    b.ne        1b
+    ret
+endfunc
+
+
+
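+// Vertical 8-tap bipred filters: the same register rotation as the unipred
+// vertical code above, but each row adds the corresponding src2 row (read
+// from x4 with a 128-byte stride) with sqadd before the rounding shift by 7.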
+function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1
+    load_qpel_filterb x7, x6
+    sub         x2, x2, x3, lsl 1
+    sub         x2, x2, x3
+    mov         x12, 128
+    ld1         { v16.s }[0], [x2], x3
+    ld1         { v17.s }[0], [x2], x3
+    ld1         { v18.s }[0], [x2], x3
+    ld1         { v19.s }[0], [x2], x3
+    ld1         { v20.s }[0], [x2], x3
+    ld1         { v21.s }[0], [x2], x3
+    ld1         { v22.s }[0], [x2], x3
+1:  ld1         { v23.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    ld1         { v25.4h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    ld1         { v25.4h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    ld1         { v25.4h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    ld1         { v25.4h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v19.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    ld1         { v25.4h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    ld1         { v25.4h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v21.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    ld1         { v25.4h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v22.s }[0], [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    ld1         { v25.4h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v6_8_neon, export=1
+    load_qpel_filterb x7, x6
+    sub         x2, x2, x3, lsl 1
+    sub         x2, x2, x3
+    sub         x1, x1, 4
+    mov         x12, 128
+    ld1         { v16.8b }, [x2], x3
+    ld1         { v17.8b }, [x2], x3
+    ld1         { v18.8b }, [x2], x3
+    ld1         { v19.8b }, [x2], x3
+    ld1         { v20.8b }, [x2], x3
+    ld1         { v21.8b }, [x2], x3
+    ld1         { v22.8b }, [x2], x3
+1:  ld1         { v23.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], 4
+    st1         { v25.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], 4
+    st1         { v25.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], 4
+    st1         { v25.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], 4
+    st1         { v25.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v19.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], 4
+    st1         { v25.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], 4
+    st1         { v25.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v21.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], 4
+    st1         { v25.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v22.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.s }[0], [x0], 4
+    st1         { v25.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v8_8_neon, export=1
+    load_qpel_filterb x7, x6
+    sub         x2, x2, x3, lsl 1
+    sub         x2, x2, x3
+    mov         x12, 128
+    ld1         { v16.8b }, [x2], x3
+    ld1         { v17.8b }, [x2], x3
+    ld1         { v18.8b }, [x2], x3
+    ld1         { v19.8b }, [x2], x3
+    ld1         { v20.8b }, [x2], x3
+    ld1         { v21.8b }, [x2], x3
+    ld1         { v22.8b }, [x2], x3
+1:  ld1         { v23.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v19.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v21.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v22.8b }, [x2], x3
+    movi        v24.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    ld1         { v25.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v25.8h
+    sqrshrun    v25.8b, v24.8h, 7
+    st1         { v25.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v12_8_neon, export=1
+    load_qpel_filterb x7, x6
+    sub         x2, x2, x3, lsl 1
+    sub         x2, x2, x3
+    sub         x1, x1, 8
+    mov         x12, 128
+    ld1         { v16.16b }, [x2], x3
+    ld1         { v17.16b }, [x2], x3
+    ld1         { v18.16b }, [x2], x3
+    ld1         { v19.16b }, [x2], x3
+    ld1         { v20.16b }, [x2], x3
+    ld1         { v21.16b }, [x2], x3
+    ld1         { v22.16b }, [x2], x3
+1:  ld1         { v23.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb2 v25, v16, v17, v18, v19, v20, v21, v22, v23
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.8b }, [x0], 8
+    st1         { v26.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    calc_qpelb2 v25, v17, v18, v19, v20, v21, v22, v23, v16
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.8b }, [x0], 8
+    st1         { v26.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    calc_qpelb2 v25, v18, v19, v20, v21, v22, v23, v16, v17
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.8b }, [x0], 8
+    st1         { v26.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    calc_qpelb2 v25, v19, v20, v21, v22, v23, v16, v17, v18
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.8b }, [x0], 8
+    st1         { v26.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v19.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    calc_qpelb2 v25, v20, v21, v22, v23, v16, v17, v18, v19
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.8b }, [x0], 8
+    st1         { v26.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    calc_qpelb2 v25, v21, v22, v23, v16, v17, v18, v19, v20
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.8b }, [x0], 8
+    st1         { v26.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v21.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    calc_qpelb2 v25, v22, v23, v16, v17, v18, v19, v20, v21
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.8b }, [x0], 8
+    st1         { v26.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v22.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    calc_qpelb2 v25, v23, v16, v17, v18, v19, v20, v21, v22
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.8b }, [x0], 8
+    st1         { v26.s }[2], [x0], x1
+    subs        x5, x5, 1
+    b.hi        1b
+2:  ret
+endfunc
+
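+// Bi-pred vertical qpel, width 16. Register use follows the put_hevc_qpel_bi
+// prototype: x0 dst, x1 dst stride, x2 src, x3 src stride, x4 src2 (int16_t,
+// 128-byte row stride), x5 height, x7 my (selects the vertical filter).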
+function ff_hevc_put_hevc_qpel_bi_v16_8_neon, export=1
+    load_qpel_filterb x7, x6
+    sub         x2, x2, x3, lsl 1
+    sub         x2, x2, x3
+    mov         x12, 128
+    ld1         { v16.16b }, [x2], x3
+    ld1         { v17.16b }, [x2], x3
+    ld1         { v18.16b }, [x2], x3
+    ld1         { v19.16b }, [x2], x3
+    ld1         { v20.16b }, [x2], x3
+    ld1         { v21.16b }, [x2], x3
+    ld1         { v22.16b }, [x2], x3
+1:  ld1         { v23.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v16, v17, v18, v19, v20, v21, v22, v23
+    calc_qpelb2 v25, v16, v17, v18, v19, v20, v21, v22, v23
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v17, v18, v19, v20, v21, v22, v23, v16
+    calc_qpelb2 v25, v17, v18, v19, v20, v21, v22, v23, v16
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v18, v19, v20, v21, v22, v23, v16, v17
+    calc_qpelb2 v25, v18, v19, v20, v21, v22, v23, v16, v17
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v19, v20, v21, v22, v23, v16, v17, v18
+    calc_qpelb2 v25, v19, v20, v21, v22, v23, v16, v17, v18
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v19.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v20, v21, v22, v23, v16, v17, v18, v19
+    calc_qpelb2 v25, v20, v21, v22, v23, v16, v17, v18, v19
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v21, v22, v23, v16, v17, v18, v19, v20
+    calc_qpelb2 v25, v21, v22, v23, v16, v17, v18, v19, v20
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v21.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v22, v23, v16, v17, v18, v19, v20, v21
+    calc_qpelb2 v25, v22, v23, v16, v17, v18, v19, v20, v21
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v22.16b }, [x2], x3
+    movi        v24.8h, 0
+    movi        v25.8h, 0
+    calc_qpelb  v24, v23, v16, v17, v18, v19, v20, v21, v22
+    calc_qpelb2 v25, v23, v16, v17, v18, v19, v20, v21, v22
+    ld1         { v26.8h, v27.8h }, [x4], x12   // src2
+    sqadd       v24.8h, v24.8h, v26.8h
+    sqadd       v25.8h, v25.8h, v27.8h
+    sqrshrun    v26.8b, v24.8h, 7
+    sqrshrun2   v26.16b, v25.8h, 7
+    st1         { v26.16b }, [x0], x1
+    subs        x5, x5, 1
+    b.hi        1b
+2:  ret
+endfunc
+
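+// Width 24 is split into a 16-wide and an 8-wide column handled by the
+// corresponding helpers; the arguments are saved and re-offset between calls.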
+function ff_hevc_put_hevc_qpel_bi_v24_8_neon, export=1
+    stp         x7, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    bl          ff_hevc_put_hevc_qpel_bi_v16_8_neon
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    ldr         x7, [sp]
+    add         x0, x0, 16
+    add         x2, x2, 16
+    add         x4, x4, 32
+    bl          ff_hevc_put_hevc_qpel_bi_v8_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
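+// Column loop for widths that are multiples of 32: the callee-saved v8-v15
+// are spilled first, the width is read from the stack argument at [sp, 128]
+// and the block is processed in 32-pixel-wide columns.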
+function ff_hevc_put_hevc_qpel_bi_v32_8_neon, export=1
+    sub         sp, sp, 64
+    st1         { v12.16b, v13.16b, v14.16b, v15.16b }, [sp]
+    sub         sp, sp, 64
+    st1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp]
+    load_qpel_filterb x7, x6
+    sub         x2, x2, x3, lsl 1
+    sub         x2, x2, x3
+    ldr         w6, [sp, 128]
+    mov         x12, 128
+1:  mov         x11, x5     // height
+    mov         x10, x0     // dst
+    mov         x8, x2      // src
+    mov         x9, x4      // src2
+
+    ld1         { v16.16b, v17.16b }, [x8], x3
+    ld1         { v18.16b, v19.16b }, [x8], x3
+    ld1         { v20.16b, v21.16b }, [x8], x3
+    ld1         { v22.16b, v23.16b }, [x8], x3
+    ld1         { v24.16b, v25.16b }, [x8], x3
+    ld1         { v26.16b, v27.16b }, [x8], x3
+    ld1         { v28.16b, v29.16b }, [x8], x3
+2:  ld1         { v30.16b, v31.16b }, [x8], x3
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v16, v18, v20, v22, v24, v26, v28, v30
+    calc_qpelb2 v9,  v16, v18, v20, v22, v24, v26, v28, v30
+    calc_qpelb  v10, v17, v19, v21, v23, v25, v27, v29, v31
+    calc_qpelb2 v11, v17, v19, v21, v23, v25, v27, v29, v31
+    ld1         { v12.8h, v13.8h, v14.8h, v15.8h }, [x9], x12   // src2
+    sqadd       v8.8h, v8.8h, v12.8h
+    sqadd       v9.8h, v9.8h, v13.8h
+    sqadd       v10.8h, v10.8h, v14.8h
+    sqadd       v11.8h, v11.8h, v15.8h
+    sqrshrun    v12.8b, v8.8h, 7
+    sqrshrun2   v12.16b, v9.8h, 7
+    sqrshrun    v13.8b, v10.8h, 7
+    sqrshrun2   v13.16b, v11.8h, 7
+    st1         { v12.16b, v13.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v16.16b, v17.16b }, [x8], x3
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v18, v20, v22, v24, v26, v28, v30, v16
+    calc_qpelb2 v9,  v18, v20, v22, v24, v26, v28, v30, v16
+    calc_qpelb  v10, v19, v21, v23, v25, v27, v29, v31, v17
+    calc_qpelb2 v11, v19, v21, v23, v25, v27, v29, v31, v17
+    ld1         { v12.8h, v13.8h, v14.8h, v15.8h }, [x9], x12   // src2
+    sqadd       v8.8h, v8.8h, v12.8h
+    sqadd       v9.8h, v9.8h, v13.8h
+    sqadd       v10.8h, v10.8h, v14.8h
+    sqadd       v11.8h, v11.8h, v15.8h
+    sqrshrun    v12.8b, v8.8h, 7
+    sqrshrun2   v12.16b, v9.8h, 7
+    sqrshrun    v13.8b, v10.8h, 7
+    sqrshrun2   v13.16b, v11.8h, 7
+    st1         { v12.16b, v13.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v18.16b, v19.16b }, [x8], x3
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v20, v22, v24, v26, v28, v30, v16, v18
+    calc_qpelb2 v9,  v20, v22, v24, v26, v28, v30, v16, v18
+    calc_qpelb  v10, v21, v23, v25, v27, v29, v31, v17, v19
+    calc_qpelb2 v11, v21, v23, v25, v27, v29, v31, v17, v19
+    ld1         { v12.8h, v13.8h, v14.8h, v15.8h }, [x9], x12   // src2
+    sqadd       v8.8h, v8.8h, v12.8h
+    sqadd       v9.8h, v9.8h, v13.8h
+    sqadd       v10.8h, v10.8h, v14.8h
+    sqadd       v11.8h, v11.8h, v15.8h
+    sqrshrun    v12.8b, v8.8h, 7
+    sqrshrun2   v12.16b, v9.8h, 7
+    sqrshrun    v13.8b, v10.8h, 7
+    sqrshrun2   v13.16b, v11.8h, 7
+    st1         { v12.16b, v13.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v20.16b, v21.16b }, [x8], x3
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v22, v24, v26, v28, v30, v16, v18, v20
+    calc_qpelb2 v9,  v22, v24, v26, v28, v30, v16, v18, v20
+    calc_qpelb  v10, v23, v25, v27, v29, v31, v17, v19, v21
+    calc_qpelb2 v11, v23, v25, v27, v29, v31, v17, v19, v21
+    ld1         { v12.8h, v13.8h, v14.8h, v15.8h }, [x9], x12   // src2
+    sqadd       v8.8h, v8.8h, v12.8h
+    sqadd       v9.8h, v9.8h, v13.8h
+    sqadd       v10.8h, v10.8h, v14.8h
+    sqadd       v11.8h, v11.8h, v15.8h
+    sqrshrun    v12.8b, v8.8h, 7
+    sqrshrun2   v12.16b, v9.8h, 7
+    sqrshrun    v13.8b, v10.8h, 7
+    sqrshrun2   v13.16b, v11.8h, 7
+    st1         { v12.16b, v13.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v22.16b, v23.16b }, [x8], x3
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v24, v26, v28, v30, v16, v18, v20, v22
+    calc_qpelb2 v9,  v24, v26, v28, v30, v16, v18, v20, v22
+    calc_qpelb  v10, v25, v27, v29, v31, v17, v19, v21, v23
+    calc_qpelb2 v11, v25, v27, v29, v31, v17, v19, v21, v23
+    ld1         { v12.8h, v13.8h, v14.8h, v15.8h }, [x9], x12   // src2
+    sqadd       v8.8h, v8.8h, v12.8h
+    sqadd       v9.8h, v9.8h, v13.8h
+    sqadd       v10.8h, v10.8h, v14.8h
+    sqadd       v11.8h, v11.8h, v15.8h
+    sqrshrun    v12.8b, v8.8h, 7
+    sqrshrun2   v12.16b, v9.8h, 7
+    sqrshrun    v13.8b, v10.8h, 7
+    sqrshrun2   v13.16b, v11.8h, 7
+    st1         { v12.16b, v13.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v24.16b, v25.16b }, [x8], x3
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v26, v28, v30, v16, v18, v20, v22, v24
+    calc_qpelb2 v9,  v26, v28, v30, v16, v18, v20, v22, v24
+    calc_qpelb  v10, v27, v29, v31, v17, v19, v21, v23, v25
+    calc_qpelb2 v11, v27, v29, v31, v17, v19, v21, v23, v25
+    ld1         { v12.8h, v13.8h, v14.8h, v15.8h }, [x9], x12   // src2
+    sqadd       v8.8h, v8.8h, v12.8h
+    sqadd       v9.8h, v9.8h, v13.8h
+    sqadd       v10.8h, v10.8h, v14.8h
+    sqadd       v11.8h, v11.8h, v15.8h
+    sqrshrun    v12.8b, v8.8h, 7
+    sqrshrun2   v12.16b, v9.8h, 7
+    sqrshrun    v13.8b, v10.8h, 7
+    sqrshrun2   v13.16b, v11.8h, 7
+    st1         { v12.16b, v13.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v26.16b, v27.16b }, [x8], x3
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v28, v30, v16, v18, v20, v22, v24, v26
+    calc_qpelb2 v9,  v28, v30, v16, v18, v20, v22, v24, v26
+    calc_qpelb  v10, v29, v31, v17, v19, v21, v23, v25, v27
+    calc_qpelb2 v11, v29, v31, v17, v19, v21, v23, v25, v27
+    ld1         { v12.8h, v13.8h, v14.8h, v15.8h }, [x9], x12   // src2
+    sqadd       v8.8h, v8.8h, v12.8h
+    sqadd       v9.8h, v9.8h, v13.8h
+    sqadd       v10.8h, v10.8h, v14.8h
+    sqadd       v11.8h, v11.8h, v15.8h
+    sqrshrun    v12.8b, v8.8h, 7
+    sqrshrun2   v12.16b, v9.8h, 7
+    sqrshrun    v13.8b, v10.8h, 7
+    sqrshrun2   v13.16b, v11.8h, 7
+    st1         { v12.16b, v13.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v28.16b, v29.16b }, [x8], x3
+    movi        v8.8h, 0
+    movi        v9.8h, 0
+    movi        v10.8h, 0
+    movi        v11.8h, 0
+    calc_qpelb  v8,  v30, v16, v18, v20, v22, v24, v26, v28
+    calc_qpelb2 v9,  v30, v16, v18, v20, v22, v24, v26, v28
+    calc_qpelb  v10, v31, v17, v19, v21, v23, v25, v27, v29
+    calc_qpelb2 v11, v31, v17, v19, v21, v23, v25, v27, v29
+    ld1         { v12.8h, v13.8h, v14.8h, v15.8h }, [x9], x12   // src2
+    sqadd       v8.8h, v8.8h, v12.8h
+    sqadd       v9.8h, v9.8h, v13.8h
+    sqadd       v10.8h, v10.8h, v14.8h
+    sqadd       v11.8h, v11.8h, v15.8h
+    sqrshrun    v12.8b, v8.8h, 7
+    sqrshrun2   v12.16b, v9.8h, 7
+    sqrshrun    v13.8b, v10.8h, 7
+    sqrshrun2   v13.16b, v11.8h, 7
+    st1         { v12.16b, v13.16b }, [x10], x1
+    subs        x11, x11, 1
+    b.hi        2b
+
+3:  add         x0, x0, 32      // dst
+    add         x2, x2, 32      // src
+    add         x4, x4, 64      // src2
+    subs        x6, x6, 32
+    b.ne        1b
+    ld1         { v8.16b, v9.16b, v10.16b, v11.16b }, [sp], 64
+    ld1         { v12.16b, v13.16b, v14.16b, v15.16b }, [sp], 64
+    ret
+endfunc
+
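+// Width 48 = 32 + 16: the 32-wide column loop is called with a width of 32
+// pushed as its stack argument, then the 16-wide helper finishes the block.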
+function ff_hevc_put_hevc_qpel_bi_v48_8_neon, export=1
+    stp         x7, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    mov         x8, 32
+    stp         x8, x8, [sp, -16]!
+    bl          ff_hevc_put_hevc_qpel_bi_v32_8_neon
+    ldp         x8, xzr, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    ldr         x7, [sp]
+    add         x0, x0, 32
+    add         x2, x2, 32
+    add         x4, x4, 64
+    bl          ff_hevc_put_hevc_qpel_bi_v16_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
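+// Width 64 needs no extra code: the 32-wide entry already loops over the
+// full width taken from the stack argument.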
+function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
+    b ff_hevc_put_hevc_qpel_bi_v32_8_neon
+endfunc
+
+
+
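+// Bi-pred hv qpel, width 4: the horizontal 8-tap pass first writes height+7
+// rows of 16-bit intermediates into a temporary stack array (128-byte row
+// stride), then the vertical pass filters that array and adds src2.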
+function ff_hevc_put_hevc_qpel_bi_hv4_8_neon, export=1
+    add         x10, x5, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x5, 7
+    mov         x4, x6
+    bl          ff_hevc_put_hevc_qpel_h4_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_qpel_filterh x7, x6
+    mov         x9, 128
+    ld1         { v16.4h }, [sp], x9
+    ld1         { v17.4h }, [sp], x9
+    ld1         { v18.4h }, [sp], x9
+    ld1         { v19.4h }, [sp], x9
+    ld1         { v20.4h }, [sp], x9
+    ld1         { v21.4h }, [sp], x9
+    ld1         { v22.4h }, [sp], x9
+1:  ld1         { v23.4h }, [sp], x9
+    calc_qpelh  v1, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+    ld1         { v5.4h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    rshrn       v1.4h, v1.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.4h }, [sp], x9
+    calc_qpelh  v1, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+    ld1         { v5.4h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    rshrn       v1.4h, v1.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.4h }, [sp], x9
+    calc_qpelh  v1, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+    ld1         { v5.4h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    rshrn       v1.4h, v1.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.4h }, [sp], x9
+    calc_qpelh  v1, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+    ld1         { v5.4h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    rshrn       v1.4h, v1.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v19.4h }, [sp], x9
+    calc_qpelh  v1, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+    ld1         { v5.4h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    rshrn       v1.4h, v1.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.4h }, [sp], x9
+    calc_qpelh  v1, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+    ld1         { v5.4h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    rshrn       v1.4h, v1.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v21.4h }, [sp], x9
+    calc_qpelh  v1, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+    ld1         { v5.4h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    rshrn       v1.4h, v1.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v22.4h }, [sp], x9
+    calc_qpelh  v1, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+    ld1         { v5.4h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    rshrn       v1.4h, v1.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], x1
+    subs        x5, x5, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv6_8_neon, export=1
+    add         x10, x5, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x5, 7
+    mov         x4, x6
+    bl          ff_hevc_put_hevc_qpel_h6_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_qpel_filterh x7, x6
+    sub         x1, x1, 4
+    mov         x9, 128
+    ld1         { v16.8h }, [sp], x9
+    ld1         { v17.8h }, [sp], x9
+    ld1         { v18.8h }, [sp], x9
+    ld1         { v19.8h }, [sp], x9
+    ld1         { v20.8h }, [sp], x9
+    ld1         { v21.8h }, [sp], x9
+    ld1         { v22.8h }, [sp], x9
+1:  ld1         { v23.8h }, [sp], x9
+    calc_qpelh  v1, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+    calc_qpelh2 v2, v2, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x9
+    calc_qpelh  v1, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+    calc_qpelh2 v2, v2, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x9
+    calc_qpelh  v1, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+    calc_qpelh2 v2, v2, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x9
+    calc_qpelh  v1, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+    calc_qpelh2 v2, v2, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v19.8h }, [sp], x9
+    calc_qpelh  v1, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+    calc_qpelh2 v2, v2, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.8h }, [sp], x9
+    calc_qpelh  v1, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+    calc_qpelh2 v2, v2, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v21.8h }, [sp], x9
+    calc_qpelh  v1, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+    calc_qpelh2 v2, v2, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v22.8h }, [sp], x9
+    calc_qpelh  v1, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+    calc_qpelh2 v2, v2, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.s }[0], [x0], 4
+    st1         { v1.h }[2], [x0], x1
+    subs        x5, x5, 1
+    b.hi        1b
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv8_8_neon, export=1
+    add         x10, x5, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x5, 7
+    mov         x4, x6
+    bl          ff_hevc_put_hevc_qpel_h8_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    load_qpel_filterh x7, x6
+    mov         x9, 128
+    ld1         { v16.8h }, [sp], x9
+    ld1         { v17.8h }, [sp], x9
+    ld1         { v18.8h }, [sp], x9
+    ld1         { v19.8h }, [sp], x9
+    ld1         { v20.8h }, [sp], x9
+    ld1         { v21.8h }, [sp], x9
+    ld1         { v22.8h }, [sp], x9
+1:  ld1         { v23.8h }, [sp], x9
+    calc_qpelh  v1, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+    calc_qpelh2 v2, v2, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v16.8h }, [sp], x9
+    calc_qpelh  v1, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+    calc_qpelh2 v2, v2, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v17.8h }, [sp], x9
+    calc_qpelh  v1, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+    calc_qpelh2 v2, v2, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v18.8h }, [sp], x9
+    calc_qpelh  v1, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+    calc_qpelh2 v2, v2, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v19.8h }, [sp], x9
+    calc_qpelh  v1, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+    calc_qpelh2 v2, v2, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v20.8h }, [sp], x9
+    calc_qpelh  v1, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+    calc_qpelh2 v2, v2, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v21.8h }, [sp], x9
+    calc_qpelh  v1, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+    calc_qpelh2 v2, v2, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.eq        2f
+
+    ld1         { v22.8h }, [sp], x9
+    calc_qpelh  v1, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+    calc_qpelh2 v2, v2, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+    ld1         { v5.8h }, [x4], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    sqxtun      v1.8b, v1.8h
+    st1         { v1.8b }, [x0], x1
+    subs        x5, x5, 1
+    b.hi        1b
+2:  ret
+endfunc
+
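+// Width 12 is split into an 8-wide and a 4-wide column via the hv8 and hv4
+// helpers, with all arguments saved around the first call.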
+function ff_hevc_put_hevc_qpel_bi_hv12_8_neon, export=1
+    stp         xzr, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x6, x7, [sp, -16]!
+    bl          ff_hevc_put_hevc_qpel_bi_hv8_8_neon
+    ldp         x6, x7, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    add         x0, x0, 8
+    add         x2, x2, 8
+    add         x4, x4, 16
+    bl          ff_hevc_put_hevc_qpel_bi_hv4_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
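+// Bi-pred hv qpel, width 16: horizontal pass into the stack array as above,
+// then .Lqpel_bi_hv16_loop runs the vertical pass in 16-wide column slices.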
+function ff_hevc_put_hevc_qpel_bi_hv16_8_neon, export=1
+    add         x10, x5, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x5, 7
+    mov         x4, x6
+    bl          ff_hevc_put_hevc_qpel_h16_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    mov         x6, 16      // width
+.Lqpel_bi_hv16_loop:
+    load_qpel_filterh x7, x8
+    mov         x9, 128
+    mov         x10, x6
+
+1:  mov         x11, x5     // height
+    mov         x7, x0      // dst
+    mov         x8, sp      // src
+    mov         x12, x4     // src2
+
+    ld1         { v16.8h, v17.8h }, [x8], x9
+    ld1         { v18.8h, v19.8h }, [x8], x9
+    ld1         { v20.8h, v21.8h }, [x8], x9
+    ld1         { v22.8h, v23.8h }, [x8], x9
+    ld1         { v24.8h, v25.8h }, [x8], x9
+    ld1         { v26.8h, v27.8h }, [x8], x9
+    ld1         { v28.8h, v29.8h }, [x8], x9
+2:  ld1         { v30.8h, v31.8h }, [x8], x9
+    calc_qpelh  v1, v16, v18, v20, v22, v24, v26, v28, v30, sshr
+    calc_qpelh2 v2, v2, v16, v18, v20, v22, v24, v26, v28, v30, sshr
+    calc_qpelh  v3, v17, v19, v21, v23, v25, v27, v29, v31, sshr
+    calc_qpelh2 v4, v4, v17, v19, v21, v23, v25, v27, v29, v31, sshr
+    ld1         { v5.8h, v6.8h }, [x12], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    saddw       v3.4s, v3.4s, v6.4h
+    saddw2      v4.4s, v4.4s, v6.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    rshrn       v2.4h, v3.4s, 7
+    rshrn2      v2.8h, v4.4s, 7
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x7], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v16.8h, v17.8h }, [x8], x9
+    calc_qpelh  v1, v18, v20, v22, v24, v26, v28, v30, v16, sshr
+    calc_qpelh2 v2, v2, v18, v20, v22, v24, v26, v28, v30, v16, sshr
+    calc_qpelh  v3, v19, v21, v23, v25, v27, v29, v31, v17, sshr
+    calc_qpelh2 v4, v4, v19, v21, v23, v25, v27, v29, v31, v17, sshr
+    ld1         { v5.8h, v6.8h }, [x12], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    saddw       v3.4s, v3.4s, v6.4h
+    saddw2      v4.4s, v4.4s, v6.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    rshrn       v2.4h, v3.4s, 7
+    rshrn2      v2.8h, v4.4s, 7
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x7], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v18.8h, v19.8h }, [x8], x9
+    calc_qpelh  v1, v20, v22, v24, v26, v28, v30, v16, v18, sshr
+    calc_qpelh2 v2, v2, v20, v22, v24, v26, v28, v30, v16, v18, sshr
+    calc_qpelh  v3, v21, v23, v25, v27, v29, v31, v17, v19, sshr
+    calc_qpelh2 v4, v4, v21, v23, v25, v27, v29, v31, v17, v19, sshr
+    ld1         { v5.8h, v6.8h }, [x12], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    saddw       v3.4s, v3.4s, v6.4h
+    saddw2      v4.4s, v4.4s, v6.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    rshrn       v2.4h, v3.4s, 7
+    rshrn2      v2.8h, v4.4s, 7
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x7], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v20.8h, v21.8h }, [x8], x9
+    calc_qpelh  v1, v22, v24, v26, v28, v30, v16, v18, v20, sshr
+    calc_qpelh2 v2, v2, v22, v24, v26, v28, v30, v16, v18, v20, sshr
+    calc_qpelh  v3, v23, v25, v27, v29, v31, v17, v19, v21, sshr
+    calc_qpelh2 v4, v4, v23, v25, v27, v29, v31, v17, v19, v21, sshr
+    ld1         { v5.8h, v6.8h }, [x12], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    saddw       v3.4s, v3.4s, v6.4h
+    saddw2      v4.4s, v4.4s, v6.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    rshrn       v2.4h, v3.4s, 7
+    rshrn2      v2.8h, v4.4s, 7
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x7], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v22.8h, v23.8h }, [x8], x9
+    calc_qpelh  v1, v24, v26, v28, v30, v16, v18, v20, v22, sshr
+    calc_qpelh2 v2, v2, v24, v26, v28, v30, v16, v18, v20, v22, sshr
+    calc_qpelh  v3, v25, v27, v29, v31, v17, v19, v21, v23, sshr
+    calc_qpelh2 v4, v4, v25, v27, v29, v31, v17, v19, v21, v23, sshr
+    ld1         { v5.8h, v6.8h }, [x12], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    saddw       v3.4s, v3.4s, v6.4h
+    saddw2      v4.4s, v4.4s, v6.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    rshrn       v2.4h, v3.4s, 7
+    rshrn2      v2.8h, v4.4s, 7
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x7], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v24.8h, v25.8h }, [x8], x9
+    calc_qpelh  v1, v26, v28, v30, v16, v18, v20, v22, v24, sshr
+    calc_qpelh2 v2, v2, v26, v28, v30, v16, v18, v20, v22, v24, sshr
+    calc_qpelh  v3, v27, v29, v31, v17, v19, v21, v23, v25, sshr
+    calc_qpelh2 v4, v4, v27, v29, v31, v17, v19, v21, v23, v25, sshr
+    ld1         { v5.8h, v6.8h }, [x12], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    saddw       v3.4s, v3.4s, v6.4h
+    saddw2      v4.4s, v4.4s, v6.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    rshrn       v2.4h, v3.4s, 7
+    rshrn2      v2.8h, v4.4s, 7
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x7], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v26.8h, v27.8h }, [x8], x9
+    calc_qpelh  v1, v28, v30, v16, v18, v20, v22, v24, v26, sshr
+    calc_qpelh2 v2, v2, v28, v30, v16, v18, v20, v22, v24, v26, sshr
+    calc_qpelh  v3, v29, v31, v17, v19, v21, v23, v25, v27, sshr
+    calc_qpelh2 v4, v4, v29, v31, v17, v19, v21, v23, v25, v27, sshr
+    ld1         { v5.8h, v6.8h }, [x12], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    saddw       v3.4s, v3.4s, v6.4h
+    saddw2      v4.4s, v4.4s, v6.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    rshrn       v2.4h, v3.4s, 7
+    rshrn2      v2.8h, v4.4s, 7
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x7], x1
+    subs        x11, x11, 1
+    b.eq        3f
+
+    ld1         { v28.8h, v29.8h }, [x8], x9
+    calc_qpelh  v1, v30, v16, v18, v20, v22, v24, v26, v28, sshr
+    calc_qpelh2 v2, v2, v30, v16, v18, v20, v22, v24, v26, v28, sshr
+    calc_qpelh  v3, v31, v17, v19, v21, v23, v25, v27, v29, sshr
+    calc_qpelh2 v4, v4, v31, v17, v19, v21, v23, v25, v27, v29, sshr
+    ld1         { v5.8h, v6.8h }, [x12], x9 // src2
+    saddw       v1.4s, v1.4s, v5.4h
+    saddw2      v2.4s, v2.4s, v5.8h
+    saddw       v3.4s, v3.4s, v6.4h
+    saddw2      v4.4s, v4.4s, v6.8h
+    rshrn       v1.4h, v1.4s, 7
+    rshrn2      v1.8h, v2.4s, 7
+    rshrn       v2.4h, v3.4s, 7
+    rshrn2      v2.8h, v4.4s, 7
+    sqxtun      v1.8b, v1.8h
+    sqxtun2     v1.16b, v2.8h
+    st1         { v1.16b }, [x7], x1
+    subs        x11, x11, 1
+    b.ne        2b
+
+3:  add         x0, x0, 16
+    add         sp, sp, 32
+    add         x4, x4, 32
+    subs        x10, x10, 16
+    b.ne        1b
+    add         x10, x5, 7
+    lsl         x10, x10, 7
+    sub         x10, x10, x6, lsl 1 // part of first line
+    add         sp, sp, x10     // tmp_array without first line
+    ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv24_8_neon, export=1
+    stp         xzr, x30, [sp, -16]!
+    stp         x0, x1, [sp, -16]!
+    stp         x2, x3, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x6, x7, [sp, -16]!
+    bl          ff_hevc_put_hevc_qpel_bi_hv16_8_neon
+    ldp         x6, x7, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x2, x3, [sp], 16
+    ldp         x0, x1, [sp], 16
+    add         x0, x0, 16
+    add         x2, x2, 16
+    add         x4, x4, 32
+    bl          ff_hevc_put_hevc_qpel_bi_hv8_8_neon
+    ldp         xzr, x30, [sp], 16
+    ret
+endfunc
+
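+// The 32/48/64-wide variants only differ in the horizontal pass; they branch
+// into the shared 16-wide column loop above with x6 set to the block width.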
+function ff_hevc_put_hevc_qpel_bi_hv32_8_neon, export=1
+    add         x10, x5, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x5, 7
+    mov         x4, x6
+    bl          ff_hevc_put_hevc_qpel_h32_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    mov         x6, 32      // width
+    b           .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv48_8_neon, export=1
+    add         x10, x5, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x5, 7
+    mov         x4, x6
+    bl          ff_hevc_put_hevc_qpel_h48_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    mov         x6, 48      // width
+    b           .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv64_8_neon, export=1
+    add         x10, x5, 7
+    lsl         x10, x10, 7
+    sub         sp, sp, x10     // tmp_array
+    stp         x0, x1, [sp, -16]!
+    stp         x4, x5, [sp, -16]!
+    stp         x7, x30, [sp, -16]!
+    add         x0, sp, 48
+    sub         x1, x2, x3, lsl 1
+    sub         x1, x1, x3
+    mov         x2, x3
+    add         x3, x5, 7
+    mov         x4, x6
+    bl          ff_hevc_put_hevc_qpel_h64_8_neon
+    ldp         x7, x30, [sp], 16
+    ldp         x4, x5, [sp], 16
+    ldp         x0, x1, [sp], 16
+    mov         x6, 64      // width
+    b           .Lqpel_bi_hv16_loop
+endfunc
+
+
diff --git a/libavcodec/aarch64/hevcdsp_sao_8.S b/libavcodec/aarch64/hevcdsp_sao_8.S
new file mode 100644
index 0000000000..8096bb8c51
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_sao_8.S
@@ -0,0 +1,166 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.Lstrides_src:
+    .word       1, 128 + 32, 128 + 32 + 1, 128 + 32 - 1
+
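+// SAO edge filter, 8-bit: x0 dst, x1 src (fixed 128+32 byte row stride),
+// x2 dst stride, x3 sao_offset_val, x4 eo class (indexes .Lstrides_src for
+// the neighbour offset), x5 width, x6 height. Wide blocks are handled 16 and
+// 8 pixels at a time; the remaining 0-7 pixel tail uses the jump table at 8:.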
+function ff_hevc_sao_edge_filter_8_neon, export=1
+    adr         x7, .Lstrides_src
+    ldr         w4, [x7, x4, lsl 2]             // stride_src
+    ld1         { v0.8h }, [x3]                 // sao_offset_val
+    mov         v0.h[7], v0.h[0]                // reorder to edge_idx order {1, 2, 0, 3, 4}
+    mov         v0.h[0], v0.h[1]
+    mov         v0.h[1], v0.h[2]
+    mov         v0.h[2], v0.h[7]
+    uzp2        v1.16b, v0.16b, v0.16b          // prepare for tbl
+    uzp1        v0.16b, v0.16b, v0.16b
+    movi        v2.16b, 2
+    mov         x10, 128+32                     // stride_src
+    cmp         x5, 16
+    b.lo        3f
+1:  sub         x8, x1, x4                      // line before
+    add         x9, x1, x4                      // line after
+    mov         x11, x6                         // height
+    mov         x12, x0                         // dst
+    mov         x13, x1                         // src
+2:  ld1         { v16.16b }, [x8], x10          // line before
+    ld1         { v20.16b }, [x13], x10         // current line
+    cmhi        v21.16b, v16.16b, v20.16b
+    cmhi        v22.16b, v20.16b, v16.16b
+    sub         v23.16b, v21.16b, v22.16b       // CMP(cur, prev)
+    ld1         { v16.16b }, [x9], x10          // line after
+    cmhi        v17.16b, v20.16b, v16.16b
+    cmhi        v18.16b, v16.16b, v20.16b
+    sub         v19.16b, v18.16b, v17.16b       // CMP(next, cur)
+    add         v24.16b, v23.16b, v19.16b
+    add         v24.16b, v24.16b, v2.16b        // idx to sao_offset_val
+    tbl         v25.16b, { v0.16b }, v24.16b
+    tbl         v26.16b, { v1.16b }, v24.16b
+    zip1        v27.16b, v25.16b, v26.16b
+    zip2        v28.16b, v25.16b, v26.16b
+    uxtl        v29.8h, v20.8b                  // widen src low 8
+    sqadd       v27.8h, v27.8h, v29.8h          // src + sao_offset_val low 8
+    uxtl2       v29.8h, v20.16b                 // widen src high 8
+    sqadd       v28.8h, v28.8h, v29.8h          // src + sao_offset_val high 8
+    sqxtun      v29.8b, v27.8h                  // av_clip lo
+    sqxtun2     v29.16b, v28.8h                 // av_clip hi
+    st1         { v29.16b }, [x12], x2
+    subs        x11, x11, 1
+    b.ne        2b
+    add         x0, x0, 16
+    add         x1, x1, 16
+    sub         x5, x5, 16
+    cmp         x5, 16
+    b.hs        1b
+
+3:  cmp         x5, 8
+    b.lo        6f
+4:  sub         x8, x1, x4                      // line before
+    add         x9, x1, x4                      // line after
+    mov         x11, x6                         // height
+    mov         x12, x0                         // dst
+    mov         x13, x1                         // src
+5:  ld1         { v16.8b }, [x8], x10           // line before
+    ld1         { v20.8b }, [x13], x10          // current line
+    cmhi        v21.8b, v16.8b, v20.8b
+    cmhi        v22.8b, v20.8b, v16.8b
+    sub         v23.8b, v21.8b, v22.8b          // CMP(cur, prev)
+    ld1         { v16.8b }, [x9], x10           // line after
+    cmhi        v17.8b, v20.8b, v16.8b
+    cmhi        v18.8b, v16.8b, v20.8b
+    sub         v19.8b, v18.8b, v17.8b          // CMP(next, cur)
+    add         v24.8b, v23.8b, v19.8b
+    add         v24.8b, v24.8b, v2.8b           // idx to sao_offset_val
+    tbl         v25.8b, { v0.16b }, v24.8b
+    tbl         v26.8b, { v1.16b }, v24.8b
+    zip1        v27.16b, v25.16b, v26.16b
+    uxtl        v29.8h, v20.8b                  // widen src
+    sqadd       v27.8h, v27.8h, v29.8h          // src + sao_offset_val low 8
+    sqxtun      v29.8b, v27.8h                  // av_clip lo
+    st1         { v29.8b }, [x12], x2
+    subs        x11, x11, 1
+    b.ne        5b
+    add         x0, x0, 8
+    add         x1, x1, 8
+    sub         x5, x5, 8
+    cmp         x5, 8
+    b.hs        4b
+
+6:  sub         x8, x1, x4                      // line before
+    add         x9, x1, x4                      // line after
+7:  ld1         { v16.8b }, [x8], x10           // line before
+    ld1         { v20.8b }, [x1], x10           // current line
+    cmhi        v21.8b, v16.8b, v20.8b
+    cmhi        v22.8b, v20.8b, v16.8b
+    sub         v23.8b, v21.8b, v22.8b          // CMP(cur, prev)
+    ld1         { v16.8b }, [x9], x10           // line after
+    cmhi        v17.8b, v20.8b, v16.8b
+    cmhi        v18.8b, v16.8b, v20.8b
+    sub         v19.8b, v18.8b, v17.8b          // CMP(next, cur)
+    add         v24.8b, v23.8b, v19.8b
+    add         v24.8b, v24.8b, v2.8b           // idx to sao_offset_val
+    tbl         v25.8b, { v0.16b }, v24.8b
+    tbl         v26.8b, { v1.16b }, v24.8b
+    zip1        v27.16b, v25.16b, v26.16b
+    uxtl        v29.8h, v20.8b                  // widen src
+    sqadd       v27.8h, v27.8h, v29.8h          // src + sao_offset_val low 8
+    sqxtun      v29.8b, v27.8h                  // av_clip lo
+    mov         x7, x0
+    adr         x11, 8f
+    add         x11, x11, x5, lsl 4
+    br          x11
+8:  b           9f                              // 0
+    nop
+    nop
+    nop
+    st1         { v29.b }[0], [x7]              // 1
+    b           9f
+    nop
+    nop
+    st1         { v29.h }[0], [x7]              // 2
+    b           9f
+    nop
+    nop
+    st1         { v29.h }[0], [x7], 2           // 3
+    st1         { v29.b }[2], [x7]
+    b           9f
+    nop
+    st1         { v29.s }[0], [x7]              // 4
+    b           9f
+    nop
+    nop
+    st1         { v29.s }[0], [x7], 4           // 5
+    st1         { v29.b }[4], [x7]
+    b           9f
+    nop
+    st1         { v29.s }[0], [x7], 4           // 6
+    st1         { v29.h }[2], [x7]
+    b           9f
+    nop
+    st1         { v29.s }[0], [x7], 4           // 7
+    st1         { v29.h }[2], [x7], 2
+    st1         { v29.b }[6], [x7]
+9:  add         x0, x0, x2
+    subs        x6, x6, 1
+    b.ne        7b
+
+10: ret
+endfunc
+
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 957e40d5ff..c7de465f73 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -265,4 +265,6 @@  int i = 0;
         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
     if (ARCH_MIPS)
         ff_hevc_dsp_init_mips(hevcdsp, bit_depth);
+    if (ARCH_AARCH64)
+        ff_hevc_dsp_init_aarch64(hevcdsp, bit_depth);
 }
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 0ae67cba85..3a0dad5a80 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -131,5 +131,6 @@  void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
+void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth);
 
 #endif /* AVCODEC_HEVCDSP_H */