diff mbox series

[FFmpeg-devel,1/6] lavc/aarch64: new optimization for 8-bit hevc_pel_bi_pixels

Message ID 01e3c77f-56a3-4191-9637-df9999df694c@myais.com.cn
State New
Headers show
Series [FFmpeg-devel,1/6] lavc/aarch64: new optimization for 8-bit hevc_pel_bi_pixels | expand

Checks

Context Check Description
yinshiyou/configure_loongarch64 warning Failed to apply patch
andriy/configure_x86 warning Failed to apply patch

Commit Message

Logan.Lyu Nov. 18, 2023, 2:06 a.m. UTC
put_hevc_pel_bi_pixels4_8_c: 54.7
put_hevc_pel_bi_pixels4_8_neon: 43.0
put_hevc_pel_bi_pixels6_8_c: 94.7
put_hevc_pel_bi_pixels6_8_neon: 37.0
put_hevc_pel_bi_pixels8_8_c: 171.0
put_hevc_pel_bi_pixels8_8_neon: 24.0
put_hevc_pel_bi_pixels12_8_c: 354.0
put_hevc_pel_bi_pixels12_8_neon: 68.7
put_hevc_pel_bi_pixels16_8_c: 588.2
put_hevc_pel_bi_pixels16_8_neon: 77.5
put_hevc_pel_bi_pixels24_8_c: 1670.7
put_hevc_pel_bi_pixels24_8_neon: 173.0
put_hevc_pel_bi_pixels32_8_c: 2267.7
put_hevc_pel_bi_pixels32_8_neon: 281.2
put_hevc_pel_bi_pixels48_8_c: 5787.5
put_hevc_pel_bi_pixels48_8_neon: 673.5
put_hevc_pel_bi_pixels64_8_c: 9897.0
put_hevc_pel_bi_pixels64_8_neon: 1159.5

Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
  libavcodec/aarch64/hevcdsp_epel_neon.S    | 179 ++++++++++++++++++++++
  libavcodec/aarch64/hevcdsp_init_aarch64.c |  10 +-
  2 files changed, 187 insertions(+), 2 deletions(-)

          NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
          NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
          NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v,);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 0, pel_bi_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 0, pel_bi_pixels,);
          NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
          NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
          NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);

Comments

Michael Niedermayer Nov. 19, 2023, 8:42 p.m. UTC | #1
On Sat, Nov 18, 2023 at 10:06:37AM +0800, Logan.Lyu wrote:
> put_hevc_pel_bi_pixels4_8_c: 54.7
> put_hevc_pel_bi_pixels4_8_neon: 43.0
> put_hevc_pel_bi_pixels6_8_c: 94.7
> put_hevc_pel_bi_pixels6_8_neon: 37.0
> put_hevc_pel_bi_pixels8_8_c: 171.0
> put_hevc_pel_bi_pixels8_8_neon: 24.0
> put_hevc_pel_bi_pixels12_8_c: 354.0
> put_hevc_pel_bi_pixels12_8_neon: 68.7
> put_hevc_pel_bi_pixels16_8_c: 588.2
> put_hevc_pel_bi_pixels16_8_neon: 77.5
> put_hevc_pel_bi_pixels24_8_c: 1670.7
> put_hevc_pel_bi_pixels24_8_neon: 173.0
> put_hevc_pel_bi_pixels32_8_c: 2267.7
> put_hevc_pel_bi_pixels32_8_neon: 281.2
> put_hevc_pel_bi_pixels48_8_c: 5787.5
> put_hevc_pel_bi_pixels48_8_neon: 673.5
> put_hevc_pel_bi_pixels64_8_c: 9897.0
> put_hevc_pel_bi_pixels64_8_neon: 1159.5
> 
> Co-Authored-By: J. Dekker <jdek@itanimul.li>
> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
> ---
>  libavcodec/aarch64/hevcdsp_epel_neon.S    | 179 ++++++++++++++++++++++
>  libavcodec/aarch64/hevcdsp_init_aarch64.c |  10 +-
>  2 files changed, 187 insertions(+), 2 deletions(-)
> 
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S
> b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index 708b903b00..74165273d7 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -244,6 +244,185 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon,
> export=1

error: corrupt patch at line 194

[...]
Logan.Lyu Nov. 22, 2023, 12:30 p.m. UTC | #2
Hi Michael,

Thank you for your reply.

I can't reproduce the error you mentioned...

I can apply patches to the ffmpeg master branch normally using 'git 
apply xxx.patch/.eml' or 'git am xxx.patch/.eml'.

By the way, I create these patches by /'git format-patch -s -o "../" 
--add-header "X-Unsent: 1" --suffix .eml --to ffmpeg-devel@ffmpeg.org -6 
--filename-max-length=100' /to generate .eml file.  And /'git 
format-patch  -4 --filename-max-length=100' /to generate .patch file as 
attachment to prevent encoding format problems in email files. These 
commands should be OK, right?

Can you try these patches again? If the error still occurs, please tell 
me how it occurred and I will fix it.


在 2023/11/20 4:42, Michael Niedermayer 写道:
> On Sat, Nov 18, 2023 at 10:06:37AM +0800, Logan.Lyu wrote:
>> put_hevc_pel_bi_pixels4_8_c: 54.7
>> put_hevc_pel_bi_pixels4_8_neon: 43.0
>> put_hevc_pel_bi_pixels6_8_c: 94.7
>> put_hevc_pel_bi_pixels6_8_neon: 37.0
>> put_hevc_pel_bi_pixels8_8_c: 171.0
>> put_hevc_pel_bi_pixels8_8_neon: 24.0
>> put_hevc_pel_bi_pixels12_8_c: 354.0
>> put_hevc_pel_bi_pixels12_8_neon: 68.7
>> put_hevc_pel_bi_pixels16_8_c: 588.2
>> put_hevc_pel_bi_pixels16_8_neon: 77.5
>> put_hevc_pel_bi_pixels24_8_c: 1670.7
>> put_hevc_pel_bi_pixels24_8_neon: 173.0
>> put_hevc_pel_bi_pixels32_8_c: 2267.7
>> put_hevc_pel_bi_pixels32_8_neon: 281.2
>> put_hevc_pel_bi_pixels48_8_c: 5787.5
>> put_hevc_pel_bi_pixels48_8_neon: 673.5
>> put_hevc_pel_bi_pixels64_8_c: 9897.0
>> put_hevc_pel_bi_pixels64_8_neon: 1159.5
>>
>> Co-Authored-By: J. Dekker<jdek@itanimul.li>
>> Signed-off-by: Logan Lyu<Logan.Lyu@myais.com.cn>
>> ---
>>   libavcodec/aarch64/hevcdsp_epel_neon.S    | 179 ++++++++++++++++++++++
>>   libavcodec/aarch64/hevcdsp_init_aarch64.c |  10 +-
>>   2 files changed, 187 insertions(+), 2 deletions(-)
>>
>> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S
>> b/libavcodec/aarch64/hevcdsp_epel_neon.S
>> index 708b903b00..74165273d7 100644
>> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
>> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
>> @@ -244,6 +244,185 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon,
>> export=1
> error: corrupt patch at line 194
>
> [...]
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org  with subject "unsubscribe".
Martin Storsjö Nov. 22, 2023, 12:36 p.m. UTC | #3
On Wed, 22 Nov 2023, Logan.Lyu wrote:

> I can't reproduce the error you mentioned...
>
> I can apply patches to the ffmpeg master branch normally using 'git 
> apply xxx.patch/.eml' or 'git am xxx.patch/.eml'.

I guess you have emails that haven't been through the complete delivery 
chain, while some later party might have rewritten things.

> By the way, I create these patches by /'git format-patch -s -o "../" 
> --add-header "X-Unsent: 1" --suffix .eml --to ffmpeg-devel@ffmpeg.org -6 
> --filename-max-length=100' /to generate .eml file.  And /'git 
> format-patch  -4 --filename-max-length=100' /to generate .patch file as 
> attachment to prevent encoding format problems in email files. These 
> command should be OK, right?

That's right for generating the patch files. I guess the problem here 
isn't about the patch files themselves, but the email delivery of them.

> Can you try these patches again? If the error still occurs, please tell 
> me how it occurred then I will fixed it.

I also tried applying them now with "git am", and I got this:

Applying: lavc/aarch64: new optimization for 8-bit hevc_pel_bi_pixels
error: corrupt patch at line 194
Patch failed at 0001 lavc/aarch64: new optimization for 8-bit hevc_pel_bi_pixels
hint: Use 'git am --show-current-patch=diff' to see the failed patch

Thus, same issue here.

In earlier iterations, the patches have arrived correctly when you have 
sent them as attachments, instead of when sending them as plain inline 
patches with git send-email.

// Martin
Logan.Lyu Nov. 24, 2023, 12:19 p.m. UTC | #4
Hi, Martin,

This is indeed a weird mistake... I don't know how to fix it...

And  can you try those .patch files I attached in every email?  I 
downloaded the attachment from the ffmpeg-devel mailing list I 
subscribed to and applied it, and it seems to work.

If the method mentioned above still doesn't work, should I re-send the 
.eml files one by one?  Please tell me how to deal with it, I will be 
grateful.

Thanks


在 2023/11/22 20:36, Martin Storsjö via ffmpeg-devel 写道:
> On Wed, 22 Nov 2023, Logan.Lyu wrote:
>
>> I can't reproduce the error you mentioned...
>>
>> I can apply patches to the ffmpeg master branch normally using 'git 
>> apply xxx.patch/.eml' or 'git am xxx.patch/.eml'.
>
> I guess you have emails that haven't been through the complete 
> delivery chain, while some later party might have rewritten things.
>
>> By the way, I create these patches by /'git format-patch -s -o "../" 
>> --add-header "X-Unsent: 1" --suffix .eml --to ffmpeg-devel@ffmpeg.org 
>> -6 --filename-max-length=100' /to generate .eml file.  And /'git 
>> format-patch  -4 --filename-max-length=100' /to generate .patch file 
>> as attachment to prevent encoding format problems in email files. 
>> These command should be OK, right?
>
> That's right for generating the patch files. I guess the problem here 
> isn't about the patch files themselves, but the email delivery of them.
>
>> Can you try these patches again? If the error still occurs, please 
>> tell me how it occurred then I will fixed it.
>
> I also tried applying them now with "git am", and I got this:
>
> Applying: lavc/aarch64: new optimization for 8-bit hevc_pel_bi_pixels
> error: corrupt patch at line 194
> Patch failed at 0001 lavc/aarch64: new optimization for 8-bit 
> hevc_pel_bi_pixels
> hint: Use 'git am --show-current-patch=diff' to see the failed patch
>
> Thus, same issue here.
>
> In earlier iterations, the patches have arrived correctly when you 
> have sent them as attachments, instead of when sending them as plain 
> inline patches with git send-email.
>
> // Martin
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Martin Storsjö Nov. 24, 2023, 12:30 p.m. UTC | #5
Hi Logan,

On Fri, 24 Nov 2023, Logan.Lyu wrote:

> And  can you try those .patch files I attached in every email?  I downloaded 
> the attachment from the ffmpeg-devel mailing list I subscribed to and applied 
> it, and it seems to work.

Oh, I see - I didn't notice the attached patch. As the patch also was sent 
inline, I only tried to apply the full mail as a patch (which had 
corrupted whitespace somewhere), but the attached patch files did work. 
Now I've successfully applied your patchset, so I can start testing and 
reviewing it when I get time for it. Thanks!

// Martin
Martin Storsjö Dec. 1, 2023, 6:09 p.m. UTC | #6
On Sat, 18 Nov 2023, Logan.Lyu wrote:

> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
> b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index 708b903b00..74165273d7 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -244,6 +244,185 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
> endfunc
>  +function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +1:      ld1             {v0.s}[0], [x2], x3 // src
> +        ushll           v16.8h, v0.8b, #6
> +        ld1             {v20.4h}, [x4], x10 // src2
> +        sqadd           v16.8h, v16.8h, v20.8h
> +        sqrshrun        v0.8b,  v16.8h, #7
> +        st1             {v0.s}[0], [x0], x1
> +        subs            w5, w5, #1
> +        b.ne            1b

In many of these functions, the "subs" instruction could be scheduled 
better, either after the ld1, or between sqrshrun and st1. It probably 
doesn't matter much, but if you have access to an in-order core, you might 
gain a cycle per iteration here.

> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index c51488275c..cf171023e7 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -156,8 +156,12 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
>         const uint8_t *src, ptrdiff_t srcstride,
>         int height, intptr_t mx, intptr_t my, int width),);
> -NEON8_FNPROTO(epel_v, (int16_t *dst,
> -        const uint8_t *src, ptrdiff_t srcstride,
> +NEON8_FNPROTO(pel_bi_pixels, (uint8_t *dst, ptrdiff_t dststride,
> +        const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
> +        int height, intptr_t mx, intptr_t my, int width),);
> +
> +NEON8_FNPROTO(epel_v, (uint8_t *dst, ptrdiff_t dststride,
> +        const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,

Here, you're breaking the interface of the existing prototypes for epel_v. 
Depending on compiler, this either causes warnings, or with modern Clang, 
errors. Please pay attention to potential warnings in the file you edit, 
when authoring a new patch.

// Martin
Martin Storsjö Dec. 1, 2023, 7:29 p.m. UTC | #7
On Fri, 24 Nov 2023, Martin Storsjö wrote:

> Hi Logan,
>
> On Fri, 24 Nov 2023, Logan.Lyu wrote:
>
>> And  can you try those .patch files I attached in every email?  I 
>> downloaded the attachment from the ffmpeg-devel mailing list I subscribed 
>> to and applied it, and it seems to work.
>
> Oh, I see - I didn't notice the attached patch. As the patch also was sent 
> inline, I only tried to apply the full mail as a patch (which had corrupted 
> whitespace somewhere), but the attached patch files did work. Now I've 
> successfully applied your patchset, so I can start testing and reviewing it 
> when I get time for it. Thanks!

I had a look at the patchset now, it looked mostly good.

I fixed most of the trivial issues I noticed, and pushed with that.

Please have a look at the comments I made, in particular, the weird

+        mov             x8, #32
+        str             x8, [sp, #-80]!

in patch 5/6 is left untouched as is. Feel free to propose follow-up 
patches to fix that. I also didn't try to tweak the scheduling of e.g. the 
subs instructions.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 708b903b00..74165273d7 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -244,6 +244,185 @@  function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
  endfunc
   +function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1 // combines 8-bit src rows with 16-bit src2 rows, 4 pixels per row
+        mov             x10, #(MAX_PB_SIZE * 2) // x10 = bytes x4 (src2) advances per row
+1:      ld1             {v0.s}[0], [x2], x3 // src: 4 bytes into lane 0, then x2 += x3
+        ushll           v16.8h, v0.8b, #6 // widen each byte to 16 bits and shift left by 6
+        ld1             {v20.4h}, [x4], x10 // src2: 4 halfwords, then x4 += x10
+        sqadd           v16.8h, v16.8h, v20.8h // saturating 16-bit add of widened src and src2
+        sqrshrun        v0.8b,  v16.8h, #7 // rounding shift right by 7, saturated to unsigned 8 bit
+        st1             {v0.s}[0], [x0], x1 // store only the 4 valid result bytes, then x0 += x1
+        subs            w5, w5, #1 // w5 = rows remaining; sets flags for b.ne
+        b.ne            1b // next row until w5 reaches 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1 // combines 8-bit src rows with 16-bit src2 rows, 6 pixels stored per row
+        mov             x10, #(MAX_PB_SIZE * 2) // x10 = bytes x4 (src2) advances per row
+        sub             x1, x1, #4 // compensate for the #4 post-increment of the first store below
+1:      ld1             {v0.8b}, [x2], x3 // src: loads 8 bytes (only 6 results are stored), then x2 += x3
+        ushll           v16.8h, v0.8b, #6 // widen each byte to 16 bits and shift left by 6
+        ld1             {v20.8h}, [x4], x10 // src2: 8 halfwords, then x4 += x10
+        sqadd           v16.8h, v16.8h, v20.8h // saturating 16-bit add of widened src and src2
+        sqrshrun        v0.8b,  v16.8h, #7 // rounding shift right by 7, saturated to unsigned 8 bit
+        st1             {v0.s}[0], [x0], #4 // store result bytes 0-3, x0 += 4
+        st1             {v0.h}[2], [x0], x1 // store result bytes 4-5, x0 += (original x1 - 4)
+        subs            w5, w5, #1 // w5 = rows remaining; sets flags for b.ne
+        b.ne            1b // next row until w5 reaches 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1 // combines 8-bit src rows with 16-bit src2 rows, 8 pixels per row
+        mov             x10, #(MAX_PB_SIZE * 2) // x10 = bytes x4 (src2) advances per row
+1:      ld1             {v0.8b}, [x2], x3    // src: 8 bytes, then x2 += x3
+        ushll           v16.8h, v0.8b, #6 // widen each byte to 16 bits and shift left by 6
+        ld1             {v20.8h}, [x4], x10  // src2: 8 halfwords, then x4 += x10
+        sqadd           v16.8h, v16.8h, v20.8h // saturating 16-bit add of widened src and src2
+        sqrshrun        v0.8b,  v16.8h, #7 // rounding shift right by 7, saturated to unsigned 8 bit
+        subs            w5, w5, #1 // w5 = rows remaining; placed before the store, flags are kept for b.ne
+        st1             {v0.8b}, [x0], x1 // store 8 result bytes, then x0 += x1
+        b.ne            1b // next row until w5 reaches 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1 // combines 8-bit src rows with 16-bit src2 rows, 12 pixels stored per row
+        mov             x10, #(MAX_PB_SIZE * 2) // x10 = bytes x4 (src2) advances per row
+        sub             x1, x1, #8 // compensate for the #8 post-increment of the first store below
+1:      ld1             {v0.16b}, [x2], x3 // src: loads 16 bytes (only 12 results are stored), then x2 += x3
+        ushll           v16.8h, v0.8b, #6 // low 8 bytes: widen to 16 bits and shift left by 6
+        ushll2          v17.8h, v0.16b, #6 // high 8 bytes: same widening shift
+        ld1             {v20.8h, v21.8h}, [x4], x10 // src2: 16 halfwords, then x4 += x10
+        sqadd           v16.8h, v16.8h, v20.8h // saturating 16-bit add, pixels 0-7
+        sqadd           v17.8h, v17.8h, v21.8h // saturating 16-bit add, pixels 8-15
+        sqrshrun        v0.8b,  v16.8h, #7 // rounding shift right by 7, saturated to unsigned 8 bit (low half)
+        sqrshrun2       v0.16b, v17.8h, #7 // same narrowing into the high half of v0
+        st1             {v0.8b}, [x0], #8 // store result bytes 0-7, x0 += 8
+        subs            w5, w5, #1 // w5 = rows remaining; flags are kept across the store for b.ne
+        st1             {v0.s}[2], [x0], x1 // store result bytes 8-11, x0 += (original x1 - 8)
+        b.ne            1b // next row until w5 reaches 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1 // combines 8-bit src rows with 16-bit src2 rows, 16 pixels per row
+        mov             x10, #(MAX_PB_SIZE * 2) // x10 = bytes x4 (src2) advances per row
+1:      ld1             {v0.16b}, [x2], x3 // src: 16 bytes, then x2 += x3
+        ushll           v16.8h, v0.8b, #6 // low 8 bytes: widen to 16 bits and shift left by 6
+        ushll2          v17.8h, v0.16b, #6 // high 8 bytes: same widening shift
+        ld1             {v20.8h, v21.8h}, [x4], x10  // src2: 16 halfwords, then x4 += x10
+        sqadd           v16.8h, v16.8h, v20.8h // saturating 16-bit add, pixels 0-7
+        sqadd           v17.8h, v17.8h, v21.8h // saturating 16-bit add, pixels 8-15
+        sqrshrun        v0.8b,  v16.8h, #7 // rounding shift right by 7, saturated to unsigned 8 bit (low half)
+        sqrshrun2       v0.16b, v17.8h, #7 // same narrowing into the high half of v0
+        subs            w5, w5, #1 // w5 = rows remaining; placed before the store, flags are kept for b.ne
+        st1             {v0.16b}, [x0], x1 // store 16 result bytes, then x0 += x1
+        b.ne            1b // next row until w5 reaches 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1 // combines 8-bit src rows with 16-bit src2 rows, 24 pixels per row
+        mov             x10, #(MAX_PB_SIZE * 2) // x10 = bytes x4 (src2) advances per row
+1:      ld1             {v0.8b-v2.8b}, [x2], x3 // src: 3 x 8 bytes, then x2 += x3
+        ushll           v16.8h, v0.8b, #6 // pixels 0-7: widen to 16 bits and shift left by 6
+        ushll           v17.8h, v1.8b, #6 // pixels 8-15
+        ushll           v18.8h, v2.8b, #6 // pixels 16-23
+        ld1             {v20.8h-v22.8h}, [x4], x10  // src2: 24 halfwords, then x4 += x10
+        sqadd           v16.8h, v16.8h, v20.8h // saturating 16-bit add of widened src and src2
+        sqadd           v17.8h, v17.8h, v21.8h
+        sqadd           v18.8h, v18.8h, v22.8h
+        sqrshrun        v0.8b, v16.8h, #7 // rounding shift right by 7, saturated to unsigned 8 bit
+        sqrshrun        v1.8b, v17.8h, #7
+        sqrshrun        v2.8b, v18.8h, #7
+        subs            w5, w5, #1 // w5 = rows remaining; placed before the store, flags are kept for b.ne
+        st1             {v0.8b-v2.8b}, [x0], x1 // store 24 result bytes, then x0 += x1
+        b.ne            1b // next row until w5 reaches 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1 // combines 8-bit src rows with 16-bit src2 rows, 32 pixels per row
+        mov             x10, #(MAX_PB_SIZE * 2) // x10 = bytes x4 (src2) advances per row
+1:      ld1             {v0.16b-v1.16b}, [x2], x3 // src: 32 bytes, then x2 += x3
+        ushll           v16.8h, v0.8b, #6 // pixels 0-7: widen to 16 bits and shift left by 6
+        ushll2          v17.8h, v0.16b, #6 // pixels 8-15
+        ushll           v18.8h, v1.8b, #6 // pixels 16-23
+        ushll2          v19.8h, v1.16b, #6 // pixels 24-31
+        ld1             {v20.8h-v23.8h}, [x4], x10  // src2: 32 halfwords, then x4 += x10
+        sqadd           v16.8h, v16.8h, v20.8h // saturating 16-bit add of widened src and src2
+        sqadd           v17.8h, v17.8h, v21.8h
+        sqadd           v18.8h, v18.8h, v22.8h
+        sqadd           v19.8h, v19.8h, v23.8h
+        sqrshrun        v0.8b,  v16.8h, #7 // rounding shift right by 7, saturated to unsigned 8 bit
+        sqrshrun2       v0.16b, v17.8h, #7 // "2" forms fill the high half of the destination
+        sqrshrun        v1.8b,  v18.8h, #7
+        sqrshrun2       v1.16b, v19.8h, #7
+        st1             {v0.16b-v1.16b}, [x0], x1 // store 32 result bytes, then x0 += x1
+        subs            w5, w5, #1 // w5 = rows remaining; sets flags for b.ne
+        b.ne            1b // next row until w5 reaches 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1 // combines 8-bit src rows with 16-bit src2 rows, 48 pixels per row
+        mov             x10, #(MAX_PB_SIZE) // x10 = second half of the per-row src2 advance (first half is the immediate below)
+1:      ld1             {v0.16b-v2.16b}, [x2], x3 // src: 48 bytes, then x2 += x3
+        ushll           v16.8h, v0.8b, #6 // pixels 0-7: widen to 16 bits and shift left by 6
+        ushll2          v17.8h, v0.16b, #6 // pixels 8-15
+        ushll           v18.8h, v1.8b, #6 // pixels 16-23
+        ushll2          v19.8h, v1.16b, #6 // pixels 24-31
+        ushll           v20.8h, v2.8b, #6 // pixels 32-39
+        ushll2          v21.8h, v2.16b, #6 // pixels 40-47
+        ld1             {v24.8h-v27.8h}, [x4], #(MAX_PB_SIZE) // src2: first 32 halfwords, x4 += MAX_PB_SIZE
+        sqadd           v16.8h, v16.8h, v24.8h // saturating 16-bit add of widened src and src2
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqadd           v19.8h, v19.8h, v27.8h
+        ld1             {v24.8h-v25.8h}, [x4], x10 // src2: remaining 16 halfwords; x4 has now advanced MAX_PB_SIZE * 2 for this row
+        sqadd           v20.8h, v20.8h, v24.8h
+        sqadd           v21.8h, v21.8h, v25.8h
+        sqrshrun        v0.8b, v16.8h, #7 // rounding shift right by 7, saturated to unsigned 8 bit
+        sqrshrun2       v0.16b, v17.8h, #7 // "2" forms fill the high half of the destination
+        sqrshrun        v1.8b, v18.8h, #7
+        sqrshrun2       v1.16b, v19.8h, #7
+        sqrshrun        v2.8b, v20.8h, #7
+        sqrshrun2       v2.16b, v21.8h, #7
+        subs            w5, w5, #1 // w5 = rows remaining; placed before the store, flags are kept for b.ne
+        st1             {v0.16b-v2.16b}, [x0], x1 // store 48 result bytes, then x0 += x1
+        b.ne            1b // next row until w5 reaches 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1 // combines 8-bit src rows with 16-bit src2 rows, 64 pixels per row
+1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 // src: 64 bytes, then x2 += x3
+        ushll           v16.8h, v0.8b, #6 // pixels 0-7: widen to 16 bits and shift left by 6
+        ushll2          v17.8h, v0.16b, #6 // pixels 8-15
+        ushll           v18.8h, v1.8b, #6 // pixels 16-23
+        ushll2          v19.8h, v1.16b, #6 // pixels 24-31
+        ushll           v20.8h, v2.8b, #6 // pixels 32-39
+        ushll2          v21.8h, v2.16b, #6 // pixels 40-47
+        ushll           v22.8h, v3.8b, #6 // pixels 48-55
+        ushll2          v23.8h, v3.16b, #6 // pixels 56-63
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) // src2
+        sqadd           v16.8h, v16.8h, v24.8h // saturating 16-bit add of widened src and src2
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqadd           v19.8h, v19.8h, v27.8h
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)
+        sqadd           v20.8h, v20.8h, v24.8h // second src2 half; x4 has now advanced MAX_PB_SIZE * 2 for this row
+        sqadd           v21.8h, v21.8h, v25.8h
+        sqadd           v22.8h, v22.8h, v26.8h
+        sqadd           v23.8h, v23.8h, v27.8h
+        sqrshrun        v0.8b, v16.8h, #7 // rounding shift right by 7, saturated to unsigned 8 bit
+        sqrshrun2       v0.16b, v17.8h, #7 // "2" forms fill the high half of the destination
+        sqrshrun        v1.8b, v18.8h, #7
+        sqrshrun2       v1.16b, v19.8h, #7
+        sqrshrun        v2.8b, v20.8h, #7
+        sqrshrun2       v2.16b, v21.8h, #7
+        sqrshrun        v3.8b, v22.8h, #7
+        sqrshrun2       v3.16b, v23.8h, #7
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 // store 64 result bytes, then x0 += x1
+        subs            w5, w5, #1 // w5 = rows remaining; sets flags for b.ne
+        b.ne            1b // next row until w5 reaches 0
+        ret
+endfunc
+
  function ff_hevc_put_hevc_epel_v4_8_neon, export=1
          load_epel_filterb x5, x4
          sub             x1, x1, x2
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index c51488275c..cf171023e7 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -156,8 +156,12 @@  NEON8_FNPROTO(pel_pixels, (int16_t *dst,
          const uint8_t *src, ptrdiff_t srcstride,
          int height, intptr_t mx, intptr_t my, int width),);
  -NEON8_FNPROTO(epel_v, (int16_t *dst,
-        const uint8_t *src, ptrdiff_t srcstride,
+NEON8_FNPROTO(pel_bi_pixels, (uint8_t *dst, ptrdiff_t dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width),);
+
+NEON8_FNPROTO(epel_v, (uint8_t *dst, ptrdiff_t dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
          int height, intptr_t mx, intptr_t my, int width),);
   NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
@@ -324,6 +328,8 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)