diff mbox series

[FFmpeg-devel,1/5] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_pixels

Message ID 20230604041756.5196-1-Logan.Lyu@myais.com.cn
State New
Headers show
Series [FFmpeg-devel,1/5] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_pixels | expand

Checks

Context Check Description
andriy/configure_x86 warning Failed to apply patch
yinshiyou/configure_loongarch64 warning Failed to apply patch

Commit Message

Logan.Lyu June 4, 2023, 4:17 a.m. UTC
From: Logan Lyu <Logan.Lyu@myais.com.cn>

Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 104 ++++++++++++++++++++++
 2 files changed, 109 insertions(+)

Comments

Martin Storsjö June 12, 2023, 7:47 a.m. UTC | #1
On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote:

> From: Logan Lyu <Logan.Lyu@myais.com.cn>
>
> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
> ---
> libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 ++
> libavcodec/aarch64/hevcdsp_qpel_neon.S    | 104 ++++++++++++++++++++++
> 2 files changed, 109 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index 483a9d5253..5a1d520eec 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -152,6 +152,9 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
>     void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
>     void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
>
> +NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
> +        const uint8_t *_src, ptrdiff_t _srcstride,
> +        int height, intptr_t mx, intptr_t my, int width),);
>
> NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
>         const uint8_t *_src, ptrdiff_t _srcstride,
> @@ -263,6 +266,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>         c->put_hevc_qpel_bi[8][0][1]   =
>         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
>
> +        NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
> +        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
>         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
>         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
>         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
> diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> index ed659cfe9b..6ca05b7201 100644
> --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> @@ -490,6 +490,110 @@ put_hevc qpel
> put_hevc qpel_uni
> put_hevc qpel_bi
>
> +function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
> +1:
> +        ldr             s0, [x2]
> +        ldr             s1, [x2, x3]
> +        add             x2, x2, x3, lsl #1
> +        str             s0, [x0]
> +        str             s1, [x0, x1]
> +        add             x0, x0, x1, lsl #1
> +        subs            w4, w4, #2
> +        b.hi            1b
> +        ret
> +endfunc

In a loop like this, I would recommend moving the "subs" instruction 
further away from the branch that depends on it. For cores with in-order 
execution, it does matter a fair bit, while it probably doesn't for cores 
with out-of-order execution. Here, the ideal location probably is after 
the two loads at the start. The same thing goes for all the other 
functions in this patch.

Other than that, this looks ok.

// Martin
Logan.Lyu June 18, 2023, 8:29 a.m. UTC | #2
Hi, Martin,

I modified it according to your comments. Please review again.

And here are the checkasm benchmark results of the related functions:

The platform I tested on is a g8y instance on Alibaba Cloud, with a chip 
based on Armv9.


put_hevc_pel_uni_pixels4_8_c: 35.9
put_hevc_pel_uni_pixels4_8_neon: 7.6
put_hevc_pel_uni_pixels6_8_c: 46.1
put_hevc_pel_uni_pixels6_8_neon: 20.6
put_hevc_pel_uni_pixels8_8_c: 53.4
put_hevc_pel_uni_pixels8_8_neon: 11.6
put_hevc_pel_uni_pixels12_8_c: 89.1
put_hevc_pel_uni_pixels12_8_neon: 25.9
put_hevc_pel_uni_pixels16_8_c: 106.4
put_hevc_pel_uni_pixels16_8_neon: 20.4
put_hevc_pel_uni_pixels24_8_c: 137.6
put_hevc_pel_uni_pixels24_8_neon: 47.1
put_hevc_pel_uni_pixels32_8_c: 173.6
put_hevc_pel_uni_pixels32_8_neon: 54.1
put_hevc_pel_uni_pixels48_8_c: 268.1
put_hevc_pel_uni_pixels48_8_neon: 117.1
put_hevc_pel_uni_pixels64_8_c: 346.1
put_hevc_pel_uni_pixels64_8_neon: 205.9


在 2023/6/12 15:47, Martin Storsjö 写道:
> On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote:
>
>> From: Logan Lyu <Logan.Lyu@myais.com.cn>
>>
>> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
>> ---
>> libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 ++
>> libavcodec/aarch64/hevcdsp_qpel_neon.S    | 104 ++++++++++++++++++++++
>> 2 files changed, 109 insertions(+)
>>
>> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
>> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> index 483a9d5253..5a1d520eec 100644
>> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> @@ -152,6 +152,9 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t 
>> *_dst, ptrdiff_t _dststride, co
>>     void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
>>     void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
>>
>> +NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, intptr_t mx, intptr_t my, int width),);
>>
>> NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
>>         const uint8_t *_src, ptrdiff_t _srcstride,
>> @@ -263,6 +266,8 @@ av_cold void 
>> ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>>         c->put_hevc_qpel_bi[8][0][1]   =
>>         c->put_hevc_qpel_bi[9][0][1]   = 
>> ff_hevc_put_hevc_qpel_bi_h16_8_neon;
>>
>> +        NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
>> +        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
>>         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
>>         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
>>         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, 
>> qpel_uni_w_v,);
>> diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S 
>> b/libavcodec/aarch64/hevcdsp_qpel_neon.S
>> index ed659cfe9b..6ca05b7201 100644
>> --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
>> +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
>> @@ -490,6 +490,110 @@ put_hevc qpel
>> put_hevc qpel_uni
>> put_hevc qpel_bi
>>
>> +function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
>> +1:
>> +        ldr             s0, [x2]
>> +        ldr             s1, [x2, x3]
>> +        add             x2, x2, x3, lsl #1
>> +        str             s0, [x0]
>> +        str             s1, [x0, x1]
>> +        add             x0, x0, x1, lsl #1
>> +        subs            w4, w4, #2
>> +        b.hi            1b
>> +        ret
>> +endfunc
>
> In a loop like this, I would recommend moving the "subs" instruction 
> further away from the branch that depends on it. For cores with 
> in-order execution, it does matter a fair bit, while it probably 
> doesn't for cores with out-of-order execution. Here, the ideal 
> location probably is after the two loads at the start. The same thing 
> goes for all the other functions in this patch.
>
> Other than that, this looks ok.
>
> // Martin
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
From a654b41fd8b100f631db49bd419ef65594ef32b3 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sun, 7 May 2023 16:58:30 +0800
Subject: [PATCH 1/5] lavc/aarch64: new optimization for 8-bit
 hevc_pel_uni_pixels

Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 104 ++++++++++++++++++++++
 2 files changed, 109 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 483a9d5253..5a1d520eec 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -152,6 +152,9 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
     void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
     void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
 
+NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
 
 NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
@@ -263,6 +266,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
 
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index ed659cfe9b..ed5b5027db 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -490,6 +490,110 @@ put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
 
+function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
+1:
+        ldr             s0, [x2]
+        ldr             s1, [x2, x3]
+        subs            w4, w4, #2
+        add             x2, x2, x3, lsl #1
+        str             s0, [x0]
+        str             s1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1
+        sub             x1, x1, #4
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        subs            w4, w4, #2
+        add             x2, x2, x3, lsl #1
+        str             s0, [x0], #4
+        st1             {v0.h}[2], [x0], x1
+        str             s1, [x0], #4
+        st1             {v1.h}[2], [x0], x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        subs            w4, w4, #2
+        add             x2, x2, x3, lsl #1
+        str             d0, [x0]
+        str             d1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1
+        sub             x1, x1, #8
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        subs            w4, w4, #2
+        add             x2, x2, x3, lsl #1
+        str             d0, [x0], #8
+        st1             {v0.s}[2], [x0], x1
+        str             d1, [x0], #8
+        st1             {v1.s}[2], [x0], x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        subs            w4, w4, #2
+        add             x2, x2, x3, lsl #1
+        str             q0, [x0]
+        str             q1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1
+1:
+        ld1             {v0.8b, v1.8b, v2.8b}, [x2], x3
+        subs            w4, w4, #1
+        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3
+        subs            w4, w4, #1
+        st1             {v0.16b, v1.16b}, [x0], x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1
+1:
+        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
+        subs            w4, w4, #1
+        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+        subs            w4, w4, #1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        b.hi            1b
+        ret
+endfunc
 
 function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
         mov             w10, #-6
Martin Storsjö July 1, 2023, 9:16 p.m. UTC | #3
On Sun, 18 Jun 2023, Logan.Lyu wrote:

> Hi, Martin,
>
> I modified it according to your comments. Please review again.
>
> And here are the checkasm benchmark results of the related functions:
>
> The platform I tested is the g8y instance of Alibaba Cloud, with a chip based 
> on armv9.

Thanks for clarifying that. When updating patches, please include those 
benchmark numbers in the commit message, and also mention there the 
hardware used for testing.

And when tweaking patches, remember to update the benchmark numbers in the 
commit message if the tweak changes the results notably.

The patchset is almost OK to be pushed; there are a couple of issues left. I 
was about to just fix up the last issues myself and push the patches, but 
patch 5 had a few more issues than I wanted to fix silently.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 483a9d5253..5a1d520eec 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -152,6 +152,9 @@  void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
     void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
     void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
 
+NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
 
 NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
@@ -263,6 +266,8 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
 
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index ed659cfe9b..6ca05b7201 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -490,6 +490,110 @@  put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
 
+function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
+1:
+        ldr             s0, [x2]
+        ldr             s1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        str             s0, [x0]
+        str             s1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1
+        sub             x1, x1, #4
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        str             s0, [x0], #4
+        st1             {v0.h}[2], [x0], x1
+        str             s1, [x0], #4
+        st1             {v1.h}[2], [x0], x1
+        subs            w4, w4, #2
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        str             d0, [x0]
+        str             d1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1
+        sub             x1, x1, #8
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        str             d0, [x0], #8
+        st1             {v0.s}[2], [x0], x1
+        str             d1, [x0], #8
+        st1             {v1.s}[2], [x0], x1
+        subs            w4, w4, #2
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        str             q0, [x0]
+        str             q1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1
+1:
+        ld1             {v0.8b, v1.8b, v2.8b}, [x2], x3
+        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3
+        st1             {v0.16b, v1.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1
+1:
+        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
+        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
 
 function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
         mov             w10, #-6