diff mbox series

[FFmpeg-devel,4/5] lavc/aarch64: new optimization for 8-bit hevc_epel_h

Message ID 20230604041756.5196-4-Logan.Lyu@myais.com.cn
State New
Headers show
Series [FFmpeg-devel,1/5] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_pixels | expand

Checks

Context Check Description
andriy/configure_x86 warning Failed to apply patch
yinshiyou/configure_loongarch64 warning Failed to apply patch

Commit Message

Logan.Lyu June 4, 2023, 4:17 a.m. UTC
From: Logan Lyu <Logan.Lyu@myais.com.cn>

Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 343 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +-
 2 files changed, 349 insertions(+), 1 deletion(-)

Comments

Martin Storsjö June 12, 2023, 8:12 a.m. UTC | #1
On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote:

> From: Logan Lyu <Logan.Lyu@myais.com.cn>
>
> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
> ---
> libavcodec/aarch64/hevcdsp_epel_neon.S    | 343 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +-
> 2 files changed, 349 insertions(+), 1 deletion(-)


> +        st2             {v20.8h, v21.8h}, [x7]
> +        subs            w3, w3, #1   // height
> +        b.ne            1b
> +        ret

In general, place the loop counter decrement somewhere else than exactly 
before the branch that depends on the result. E.g. after the initial loads 
is usually a good place, or between the st1/2 instructions and the 
instructions that calculate the final output values.

The same goes probably for all places in all these patches.

> @@ -283,13 +287,14 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
>
>         if (have_i8mm(cpu_flags)) {
> +            NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
>             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
>             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
>             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
>             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
>         }
> -
>     }
> +
>     if (bit_depth == 10) {

Here are some stray unrelated whitespace changes.

Other than that, this patch looks mostly reasonable.

// Martin
Logan.Lyu June 18, 2023, 8:23 a.m. UTC | #2
Hi, Martin,

I modified it according to your comments. Please review again.

And here are the checkasm benchmark results of the related functions:

put_hevc_epel_h4_8_c: 67.1
put_hevc_epel_h4_8_i8mm: 21.1
put_hevc_epel_h6_8_c: 147.1
put_hevc_epel_h6_8_i8mm: 45.1
put_hevc_epel_h8_8_c: 237.4
put_hevc_epel_h8_8_i8mm: 72.1
put_hevc_epel_h12_8_c: 527.4
put_hevc_epel_h12_8_i8mm: 115.4
put_hevc_epel_h16_8_c: 943.6
put_hevc_epel_h16_8_i8mm: 153.9
put_hevc_epel_h24_8_c: 2105.4
put_hevc_epel_h24_8_i8mm: 384.4
put_hevc_epel_h32_8_c: 3631.4
put_hevc_epel_h32_8_i8mm: 519.9
put_hevc_epel_h48_8_c: 8082.1
put_hevc_epel_h48_8_i8mm: 1110.4
put_hevc_epel_h64_8_c: 14400.6
put_hevc_epel_h64_8_i8mm: 2057.1

put_hevc_qpel_h4_8_c: 124.9
put_hevc_qpel_h4_8_neon: 43.1
put_hevc_qpel_h4_8_i8mm: 33.1
put_hevc_qpel_h6_8_c: 269.4
put_hevc_qpel_h6_8_neon: 90.6
put_hevc_qpel_h6_8_i8mm: 61.4
put_hevc_qpel_h8_8_c: 477.6
put_hevc_qpel_h8_8_neon: 82.1
put_hevc_qpel_h8_8_i8mm: 99.9
put_hevc_qpel_h12_8_c: 1062.4
put_hevc_qpel_h12_8_neon: 226.9
put_hevc_qpel_h12_8_i8mm: 170.9
put_hevc_qpel_h16_8_c: 1880.6
put_hevc_qpel_h16_8_neon: 302.9
put_hevc_qpel_h16_8_i8mm: 251.4
put_hevc_qpel_h24_8_c: 4221.9
put_hevc_qpel_h24_8_neon: 893.9
put_hevc_qpel_h24_8_i8mm: 626.1
put_hevc_qpel_h32_8_c: 7437.6
put_hevc_qpel_h32_8_neon: 1189.9
put_hevc_qpel_h32_8_i8mm: 959.1
put_hevc_qpel_h48_8_c: 16838.4
put_hevc_qpel_h48_8_neon: 2727.9
put_hevc_qpel_h48_8_i8mm: 2163.9
put_hevc_qpel_h64_8_c: 29982.1
put_hevc_qpel_h64_8_neon: 4777.6


在 2023/6/12 16:12, Martin Storsjö 写道:
> On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote:
>
>> From: Logan Lyu <Logan.Lyu@myais.com.cn>
>>
>> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
>> ---
>> libavcodec/aarch64/hevcdsp_epel_neon.S    | 343 ++++++++++++++++++++++
>> libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +-
>> 2 files changed, 349 insertions(+), 1 deletion(-)
>
>
>> +        st2             {v20.8h, v21.8h}, [x7]
>> +        subs            w3, w3, #1   // height
>> +        b.ne            1b
>> +        ret
>
> In general, place the loop counter decrement somewhere else than 
> exactly before the branch that depends on the result. E.g. after the 
> initial loads is usually a good place, or between the st1/2 
> instructions and the instructions that calculate the final output values.
>
> The same goes probably for all places in all these patches.
>
>> @@ -283,13 +287,14 @@ av_cold void 
>> ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>>         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, 
>> qpel_uni_w_v,);
>>
>>         if (have_i8mm(cpu_flags)) {
>> +            NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
>>             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h 
>> ,_i8mm);
>>             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
>>             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, 
>> qpel_uni_w_h, _i8mm);
>>             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, 
>> qpel_uni_w_hv, _i8mm);
>>         }
>> -
>>     }
>> +
>>     if (bit_depth == 10) {
>
> Here are some stray unrelated whitespace changes.
>
> Other than that, this patch looks mostly reasonable.
>
> // Martin
>
Logan.Lyu June 18, 2023, 8:26 a.m. UTC | #3
Add missing patch attachment...

在 2023/6/18 16:23, Logan.Lyu 写道:
> Hi, Martin,
>
> I modified it according to your comments. Please review again.
>
> And here are the checkasm benchmark results of the related functions:
>
> put_hevc_epel_h4_8_c: 67.1
> put_hevc_epel_h4_8_i8mm: 21.1
> put_hevc_epel_h6_8_c: 147.1
> put_hevc_epel_h6_8_i8mm: 45.1
> put_hevc_epel_h8_8_c: 237.4
> put_hevc_epel_h8_8_i8mm: 72.1
> put_hevc_epel_h12_8_c: 527.4
> put_hevc_epel_h12_8_i8mm: 115.4
> put_hevc_epel_h16_8_c: 943.6
> put_hevc_epel_h16_8_i8mm: 153.9
> put_hevc_epel_h24_8_c: 2105.4
> put_hevc_epel_h24_8_i8mm: 384.4
> put_hevc_epel_h32_8_c: 3631.4
> put_hevc_epel_h32_8_i8mm: 519.9
> put_hevc_epel_h48_8_c: 8082.1
> put_hevc_epel_h48_8_i8mm: 1110.4
> put_hevc_epel_h64_8_c: 14400.6
> put_hevc_epel_h64_8_i8mm: 2057.1
>
> put_hevc_qpel_h4_8_c: 124.9
> put_hevc_qpel_h4_8_neon: 43.1
> put_hevc_qpel_h4_8_i8mm: 33.1
> put_hevc_qpel_h6_8_c: 269.4
> put_hevc_qpel_h6_8_neon: 90.6
> put_hevc_qpel_h6_8_i8mm: 61.4
> put_hevc_qpel_h8_8_c: 477.6
> put_hevc_qpel_h8_8_neon: 82.1
> put_hevc_qpel_h8_8_i8mm: 99.9
> put_hevc_qpel_h12_8_c: 1062.4
> put_hevc_qpel_h12_8_neon: 226.9
> put_hevc_qpel_h12_8_i8mm: 170.9
> put_hevc_qpel_h16_8_c: 1880.6
> put_hevc_qpel_h16_8_neon: 302.9
> put_hevc_qpel_h16_8_i8mm: 251.4
> put_hevc_qpel_h24_8_c: 4221.9
> put_hevc_qpel_h24_8_neon: 893.9
> put_hevc_qpel_h24_8_i8mm: 626.1
> put_hevc_qpel_h32_8_c: 7437.6
> put_hevc_qpel_h32_8_neon: 1189.9
> put_hevc_qpel_h32_8_i8mm: 959.1
> put_hevc_qpel_h48_8_c: 16838.4
> put_hevc_qpel_h48_8_neon: 2727.9
> put_hevc_qpel_h48_8_i8mm: 2163.9
> put_hevc_qpel_h64_8_c: 29982.1
> put_hevc_qpel_h64_8_neon: 4777.6
>
>
> 在 2023/6/12 16:12, Martin Storsjö 写道:
>> On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote:
>>
>>> From: Logan Lyu <Logan.Lyu@myais.com.cn>
>>>
>>> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
>>> ---
>>> libavcodec/aarch64/hevcdsp_epel_neon.S    | 343 ++++++++++++++++++++++
>>> libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +-
>>> 2 files changed, 349 insertions(+), 1 deletion(-)
>>
>>
>>> +        st2             {v20.8h, v21.8h}, [x7]
>>> +        subs            w3, w3, #1   // height
>>> +        b.ne            1b
>>> +        ret
>>
>> In general, place the loop counter decrement somewhere else than 
>> exactly before the branch that depends on the result. E.g. after the 
>> initial loads is usually a good place, or between the st1/2 
>> instructions and the instructions that calculate the final output 
>> values.
>>
>> The same goes probably for all places in all these patches.
>>
>>> @@ -283,13 +287,14 @@ av_cold void 
>>> ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>>>         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, 
>>> qpel_uni_w_v,);
>>>
>>>         if (have_i8mm(cpu_flags)) {
>>> +            NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
>>>             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, 
>>> epel_uni_w_h ,_i8mm);
>>>             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
>>>             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, 
>>> qpel_uni_w_h, _i8mm);
>>>             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, 
>>> qpel_uni_w_hv, _i8mm);
>>>         }
>>> -
>>>     }
>>> +
>>>     if (bit_depth == 10) {
>>
>> Here are some stray unrelated whitespace changes.
>>
>> Other than that, this patch looks mostly reasonable.
>>
>> // Martin
>>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
From e5432a25ce05cb9c47e8bcd345d1ab0c1133c82b Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sun, 28 May 2023 10:30:28 +0800
Subject: [PATCH 4/5] lavc/aarch64: new optimization for 8-bit hevc_epel_h

Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 343 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 348 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index ca37ce1786..8b6f396a0b 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -33,6 +33,349 @@ const epel_filters, align=4
 endconst
 
 #if HAVE_I8MM
+
+.macro EPEL_H_HEADER
+        movrel          x5, epel_filters   // x5 = address of the epel_filters table
+        add             x5, x5, x4, lsl #2   // step x4 * 4 bytes into the table (x4 is presumably mx - confirm against the C prototype)
+        ld1r            {v30.4s}, [x5]   // replicate the selected 32-bit entry (4 bytes) into every lane of v30
+        sub             x1, x1, #1   // move x1 back one byte before the row loads
+        mov             x10, #(MAX_PB_SIZE * 2)   // x10 = MAX_PB_SIZE * 2; the functions below advance x0 by this per row
+.endm
+
+function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.8b}, [x1], x2   // load 8 bytes of the row, then x1 += x2; output k uses loaded bytes k..k+3
+        subs            w3, w3, #1   // height; sets the flags tested by b.ne at the loop end
+        ext             v5.8b, v4.8b, v4.8b, #1   // row bytes rotated by 1
+        ext             v6.8b, v4.8b, v4.8b, #2   // rotated by 2
+        ext             v7.8b, v4.8b, v4.8b, #3   // rotated by 3
+        trn1            v4.2s, v4.2s, v5.2s   // v4 = {bytes 0-3, bytes 1-4}
+        trn1            v6.2s, v6.2s, v7.2s   // v6 = {bytes 2-5, bytes 3-6}
+        trn1            v4.2d, v4.2d, v6.2d   // v4 = the four 4-byte windows for outputs 0-3
+        movi            v16.2d, #0   // clear the accumulator
+        usdot           v16.4s, v4.16b, v30.16b   // each 32-bit lane = dot product of one window with the 4 bytes in v30
+        xtn             v16.4h, v16.4s   // narrow the results to 16 bits
+        st1             {v16.4h}, [x0], x10   // store 4 results, then x0 += x10
+        b.ne            1b   // loop until w3 has counted down to 0
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b},  [x1], x2   // load 16 bytes of the row, then x1 += x2; output k uses loaded bytes k..k+3
+        subs            w3, w3, #1   // height; sets the flags tested by b.ne at the loop end
+        ext             v5.16b, v4.16b, v4.16b, #1   // full-width rotate by 1 (bytes 5-8 are needed by trn2 below)
+        ext             v6.8b, v4.8b, v4.8b, #2   // low 8 bytes rotated by 2
+        ext             v7.8b, v4.8b, v4.8b, #3   // low 8 bytes rotated by 3
+        trn1            v16.2s, v4.2s, v5.2s   // windows for outputs 0 and 1
+        trn2            v17.2s, v4.2s, v5.2s   // windows for outputs 4 and 5
+        trn1            v6.2s, v6.2s, v7.2s   // windows for outputs 2 and 3
+        trn1            v16.2d, v16.2d, v6.2d   // v16 = windows for outputs 0-3
+        movi            v18.2d, #0   // clear both accumulators
+        movi            v19.2d, #0
+        usdot           v18.4s, v16.16b, v30.16b   // outputs 0-3
+        usdot           v19.2s, v17.8b, v30.8b   // outputs 4-5
+        xtn             v18.4h, v18.4s   // narrow the results to 16 bits
+        xtn             v19.4h, v19.4s
+        str             d18, [x0]   // store outputs 0-3 (8 bytes)
+        str             s19, [x0, #8]   // store outputs 4-5 (4 bytes)
+        add             x0, x0, x10   // x0 += x10
+        b.ne            1b   // loop until w3 has counted down to 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2   // load 16 bytes of the row, then x1 += x2; output k uses loaded bytes k..k+3
+        subs            w3, w3, #1   // height; sets the flags tested by b.ne at the loop end
+        ext             v5.16b, v4.16b, v4.16b, #1   // row bytes rotated by 1
+        ext             v6.16b, v4.16b, v4.16b, #2   // rotated by 2
+        ext             v7.16b, v4.16b, v4.16b, #3   // rotated by 3
+        zip1            v20.4s, v4.4s, v6.4s   // windows starting at bytes 0, 2, 4, 6
+        zip1            v21.4s, v5.4s, v7.4s   // windows starting at bytes 1, 3, 5, 7
+        movi            v16.2d, #0   // clear both accumulators
+        movi            v17.2d, #0
+        usdot           v16.4s, v20.16b, v30.16b   // even outputs 0, 2, 4, 6
+        usdot           v17.4s, v21.16b, v30.16b   // odd outputs 1, 3, 5, 7
+        xtn             v16.4h, v16.4s   // narrow the results to 16 bits
+        xtn             v17.4h, v17.4s
+        st2             {v16.4h, v17.4h}, [x0], x10   // interleave even/odd into outputs 0-7, then x0 += x10
+        b.ne            1b   // loop until w3 has counted down to 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2   // load 16 bytes of the row, then x1 += x2; output k uses loaded bytes k..k+3
+        subs            w3, w3, #1   // height; sets the flags tested by b.ne at the loop end
+        ext             v5.16b, v4.16b, v4.16b, #1   // row bytes rotated by 1
+        ext             v6.16b, v4.16b, v4.16b, #2   // rotated by 2
+        ext             v7.16b, v4.16b, v4.16b, #3   // rotated by 3
+        trn1            v20.2d, v4.2d, v6.2d   // {low half of v4, low half of v6}
+        trn2            v22.2d, v4.2d, v6.2d   // {high half of v4, high half of v6}
+        trn1            v21.2d, v5.2d, v7.2d   // {low half of v5, low half of v7}
+        trn2            v23.2d, v5.2d, v7.2d   // {high half of v5, high half of v7}
+        trn1            v4.4s, v20.4s, v21.4s   // windows for outputs 0-3
+        trn2            v5.4s, v20.4s, v21.4s   // windows for outputs 4-7
+        trn1            v6.4s, v22.4s, v23.4s   // windows for outputs 8-11
+        movi            v16.2d, #0   // clear the three accumulators
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v4.16b, v30.16b   // outputs 0-3
+        usdot           v17.4s, v5.16b, v30.16b   // outputs 4-7
+        usdot           v18.4s, v6.16b, v30.16b   // outputs 8-11
+        xtn             v16.4h, v16.4s   // v16 low half = outputs 0-3 as 16-bit
+        xtn2            v16.8h, v17.4s   // v16 high half = outputs 4-7
+        xtn             v18.4h, v18.4s   // v18 low half = outputs 8-11
+        str             q16, [x0]   // store outputs 0-7 (16 bytes)
+        str             d18, [x0, #16]   // store outputs 8-11 (8 bytes)
+        add             x0, x0, x10   // x0 += x10
+        b.ne            1b   // loop until w3 has counted down to 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b}, [x1], x2   // load 32 bytes of the row, then x1 += x2; output k uses loaded bytes k..k+3
+        subs            w3, w3, #1   // height; sets the flags tested by b.ne at the loop end
+        ext             v5.16b, v0.16b, v1.16b, #1   // bytes 1-16 of the row
+        ext             v6.16b, v0.16b, v1.16b, #2   // bytes 2-17
+        ext             v7.16b, v0.16b, v1.16b, #3   // bytes 3-18
+        zip1            v20.4s, v0.4s, v6.4s   // windows starting at bytes 0, 2, 4, 6
+        zip2            v22.4s, v0.4s, v6.4s   // windows starting at bytes 8, 10, 12, 14
+        zip1            v21.4s, v5.4s, v7.4s   // windows starting at bytes 1, 3, 5, 7
+        zip2            v23.4s, v5.4s, v7.4s   // windows starting at bytes 9, 11, 13, 15
+        movi            v16.2d, #0   // clear the four accumulators
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v16.4s, v20.16b, v30.16b   // outputs 0, 2, 4, 6
+        usdot           v17.4s, v21.16b, v30.16b   // outputs 1, 3, 5, 7
+        usdot           v18.4s, v22.16b, v30.16b   // outputs 8, 10, 12, 14
+        usdot           v19.4s, v23.16b, v30.16b   // outputs 9, 11, 13, 15
+        xtn             v16.4h, v16.4s   // v16 = even outputs 0..14 as 16-bit (low half here, high half next)
+        xtn2            v16.8h, v18.4s
+        xtn             v17.4h, v17.4s   // v17 = odd outputs 1..15 as 16-bit
+        xtn2            v17.8h, v19.4s
+        st2             {v16.8h, v17.8h}, [x0], x10   // interleave even/odd into outputs 0-15, then x0 += x10
+        b.ne            1b   // loop until w3 has counted down to 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b}, [x1], x2   // load 32 bytes of the row, then x1 += x2; output k uses loaded bytes k..k+3
+        subs            w3, w3, #1   // height; sets the flags tested by b.ne at the loop end
+        ext             v5.16b, v0.16b, v1.16b, #1   // bytes 1-16 of the row
+        ext             v6.16b, v0.16b, v1.16b, #2   // bytes 2-17
+        ext             v7.16b, v0.16b, v1.16b, #3   // bytes 3-18
+        ext             v26.16b, v1.16b, v1.16b, #1   // v1 rotated by 1; only its first two windows feed stored outputs
+        ext             v27.16b, v1.16b, v1.16b, #2   // v1 rotated by 2
+        ext             v28.16b, v1.16b, v1.16b, #3   // v1 rotated by 3
+        movi            v16.2d, #0   // clear the eight accumulators
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v16.4s, v0.16b, v30.16b   // outputs 0, 4, 8, 12
+        usdot           v17.4s, v5.16b, v30.16b   // outputs 1, 5, 9, 13
+        usdot           v18.4s, v6.16b, v30.16b   // outputs 2, 6, 10, 14
+        usdot           v19.4s, v7.16b, v30.16b   // outputs 3, 7, 11, 15
+        usdot           v20.4s, v1.16b, v30.16b   // outputs 16, 20 (upper two lanes are not stored)
+        usdot           v21.4s, v26.16b, v30.16b   // outputs 17, 21
+        usdot           v22.4s, v27.16b, v30.16b   // outputs 18, 22
+        usdot           v23.4s, v28.16b, v30.16b   // outputs 19, 23
+        xtn             v16.4h, v16.4s   // v16 = outputs 0,4,8,12 | 16,20,.. as 16-bit
+        xtn2            v16.8h, v20.4s
+        xtn             v17.4h, v17.4s   // v17 = outputs 1,5,9,13 | 17,21,..
+        xtn2            v17.8h, v21.4s
+        xtn             v18.4h, v18.4s   // v18 = outputs 2,6,10,14 | 18,22,..
+        xtn2            v18.8h, v22.4s
+        xtn             v19.4h, v19.4s   // v19 = outputs 3,7,11,15 | 19,23,..
+        xtn2            v19.8h, v23.4s
+        zip1            v20.8h, v16.8h, v18.8h   // even outputs 0, 2, ..., 14
+        zip1            v21.8h, v17.8h, v19.8h   // odd outputs 1, 3, ..., 15
+        zip2            v22.8h, v16.8h, v18.8h   // even outputs from 16 upward
+        zip2            v23.8h, v17.8h, v19.8h   // odd outputs from 17 upward
+        zip1            v22.8h, v22.8h, v23.8h   // v22 = outputs 16-23 in order
+        add             x7, x0, #32   // x7 = current row + 32 bytes, taken before x0 is post-incremented
+        st2             {v20.8h, v21.8h}, [x0], x10   // store outputs 0-15, then x0 += x10
+        st1             {v22.8h}, [x7]   // store outputs 16-23
+        b.ne            1b   // loop until w3 has counted down to 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b, v2.16b}, [x1], x2   // load 48 bytes of the row, then x1 += x2; output k uses loaded bytes k..k+3
+        subs            w3, w3, #1   // height; sets the flags tested by b.ne at the loop end
+        ext             v5.16b, v0.16b, v1.16b, #1   // bytes 1-16 of the row
+        ext             v6.16b, v0.16b, v1.16b, #2   // bytes 2-17
+        ext             v7.16b, v0.16b, v1.16b, #3   // bytes 3-18
+        ext             v26.16b, v1.16b, v2.16b, #1   // bytes 17-32
+        ext             v27.16b, v1.16b, v2.16b, #2   // bytes 18-33
+        ext             v28.16b, v1.16b, v2.16b, #3   // bytes 19-34
+        movi            v16.2d, #0   // clear the eight accumulators
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v16.4s, v0.16b, v30.16b   // outputs 0, 4, 8, 12
+        usdot           v17.4s, v5.16b, v30.16b   // outputs 1, 5, 9, 13
+        usdot           v18.4s, v6.16b, v30.16b   // outputs 2, 6, 10, 14
+        usdot           v19.4s, v7.16b, v30.16b   // outputs 3, 7, 11, 15
+        usdot           v20.4s, v1.16b, v30.16b   // outputs 16, 20, 24, 28
+        usdot           v21.4s, v26.16b, v30.16b   // outputs 17, 21, 25, 29
+        usdot           v22.4s, v27.16b, v30.16b   // outputs 18, 22, 26, 30
+        usdot           v23.4s, v28.16b, v30.16b   // outputs 19, 23, 27, 31
+        xtn             v16.4h, v16.4s   // v16 = outputs 0, 4, ..., 28 as 16-bit
+        xtn2            v16.8h, v20.4s
+        xtn             v17.4h, v17.4s   // v17 = outputs 1, 5, ..., 29
+        xtn2            v17.8h, v21.4s
+        xtn             v18.4h, v18.4s   // v18 = outputs 2, 6, ..., 30
+        xtn2            v18.8h, v22.4s
+        xtn             v19.4h, v19.4s   // v19 = outputs 3, 7, ..., 31
+        xtn2            v19.8h, v23.4s
+        st4             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x10   // 4-way interleave into outputs 0-31, then x0 += x10
+        b.ne            1b   // loop until w3 has counted down to 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2   // load 64 bytes of the row, then x1 += x2; output k uses loaded bytes k..k+3
+        subs            w3, w3, #1   // height; sets the flags tested by b.ne at the loop end
+        ext             v4.16b, v0.16b, v1.16b, #1   // bytes 1-16 of the row
+        ext             v5.16b, v0.16b, v1.16b, #2   // bytes 2-17
+        ext             v6.16b, v0.16b, v1.16b, #3   // bytes 3-18
+        ext             v16.16b, v1.16b, v2.16b, #1   // bytes 17-32
+        ext             v17.16b, v1.16b, v2.16b, #2   // bytes 18-33
+        ext             v18.16b, v1.16b, v2.16b, #3   // bytes 19-34
+        movi            v20.2d, #0   // clear accumulators for outputs 0-15
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v0.16b, v30.16b   // outputs 0, 4, 8, 12
+        usdot           v21.4s, v4.16b, v30.16b   // outputs 1, 5, 9, 13
+        usdot           v22.4s, v5.16b, v30.16b   // outputs 2, 6, 10, 14
+        usdot           v23.4s, v6.16b, v30.16b   // outputs 3, 7, 11, 15
+        movi            v24.2d, #0   // clear accumulators for outputs 16-31
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v1.16b, v30.16b   // outputs 16, 20, 24, 28
+        usdot           v25.4s, v16.16b, v30.16b   // outputs 17, 21, 25, 29
+        usdot           v26.4s, v17.16b, v30.16b   // outputs 18, 22, 26, 30
+        usdot           v27.4s, v18.16b, v30.16b   // outputs 19, 23, 27, 31
+        xtn             v20.4h, v20.4s   // v20 = outputs 0, 4, ..., 28 as 16-bit
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s   // v21 = outputs 1, 5, ..., 29
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s   // v22 = outputs 2, 6, ..., 30
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s   // v23 = outputs 3, 7, ..., 31
+        xtn2            v23.8h, v27.4s
+        add             x7, x0, #64   // x7 = current row + 64 bytes; must be taken before the store below advances x0
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x10   // 4-way interleave into outputs 0-31, then x0 += x10
+        ext             v4.16b, v2.16b, v3.16b, #1   // bytes 33-48
+        ext             v5.16b, v2.16b, v3.16b, #2   // bytes 34-49
+        ext             v6.16b, v2.16b, v3.16b, #3   // bytes 35-50
+        movi            v20.2d, #0   // clear accumulators for outputs 32-47
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v2.16b, v30.16b   // outputs 32, 36, 40, 44
+        usdot           v21.4s, v4.16b, v30.16b   // outputs 33, 37, 41, 45
+        usdot           v22.4s, v5.16b, v30.16b   // outputs 34, 38, 42, 46
+        usdot           v23.4s, v6.16b, v30.16b   // outputs 35, 39, 43, 47
+        xtn             v20.4h, v20.4s   // narrow each accumulator separately so st4 can restore sequential order
+        xtn             v21.4h, v21.4s
+        xtn             v22.4h, v22.4s
+        xtn             v23.4h, v23.4s
+        st4             {v20.4h, v21.4h, v22.4h, v23.4h}, [x7]   // 4-way interleave into outputs 32-47 of the current row
+        b.ne            1b   // loop until w3 has counted down to 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+        sub             x2, x2, #64   // the first load of each row already advances x1 by 64
+1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64   // load 64 bytes of the row, x1 += 64; output k uses loaded bytes k..k+3
+        subs            w3, w3, #1   // height; sets the flags tested by b.ne at the loop end
+        ext             v4.16b, v0.16b, v1.16b, #1   // bytes 1-16 of the row
+        ext             v5.16b, v0.16b, v1.16b, #2   // bytes 2-17
+        ext             v6.16b, v0.16b, v1.16b, #3   // bytes 3-18
+        ext             v16.16b, v1.16b, v2.16b, #1   // bytes 17-32
+        ext             v17.16b, v1.16b, v2.16b, #2   // bytes 18-33
+        ext             v18.16b, v1.16b, v2.16b, #3   // bytes 19-34
+        movi            v20.2d, #0   // clear accumulators for outputs 0-15
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v0.16b, v30.16b   // outputs 0, 4, 8, 12
+        usdot           v21.4s, v4.16b, v30.16b   // outputs 1, 5, 9, 13
+        usdot           v22.4s, v5.16b, v30.16b   // outputs 2, 6, 10, 14
+        usdot           v23.4s, v6.16b, v30.16b   // outputs 3, 7, 11, 15
+        movi            v24.2d, #0   // clear accumulators for outputs 16-31
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v1.16b, v30.16b   // outputs 16, 20, 24, 28
+        usdot           v25.4s, v16.16b, v30.16b   // outputs 17, 21, 25, 29
+        usdot           v26.4s, v17.16b, v30.16b   // outputs 18, 22, 26, 30
+        usdot           v27.4s, v18.16b, v30.16b   // outputs 19, 23, 27, 31
+        xtn             v20.4h, v20.4s   // v20 = outputs 0, 4, ..., 28 as 16-bit
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s   // v21 = outputs 1, 5, ..., 29
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s   // v22 = outputs 2, 6, ..., 30
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s   // v23 = outputs 3, 7, ..., 31
+        xtn2            v23.8h, v27.4s
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64   // 4-way interleave into outputs 0-31, then x0 += 64
+        ld1             {v7.8b}, [x1], x2   // load 8 more bytes (row bytes 64-71), then x1 += x2 to complete the row advance
+        ext             v4.16b, v2.16b, v3.16b, #1   // bytes 33-48
+        ext             v5.16b, v2.16b, v3.16b, #2   // bytes 34-49
+        ext             v6.16b, v2.16b, v3.16b, #3   // bytes 35-50
+        ext             v16.16b, v3.16b, v7.16b, #1   // bytes 49-64
+        ext             v17.16b, v3.16b, v7.16b, #2   // bytes 50-65
+        ext             v18.16b, v3.16b, v7.16b, #3   // bytes 51-66
+        movi            v20.2d, #0   // clear accumulators for outputs 32-47
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v2.16b, v30.16b   // outputs 32, 36, 40, 44
+        usdot           v21.4s, v4.16b, v30.16b   // outputs 33, 37, 41, 45
+        usdot           v22.4s, v5.16b, v30.16b   // outputs 34, 38, 42, 46
+        usdot           v23.4s, v6.16b, v30.16b   // outputs 35, 39, 43, 47
+        movi            v24.2d, #0   // clear accumulators for outputs 48-63
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v3.16b, v30.16b   // outputs 48, 52, 56, 60
+        usdot           v25.4s, v16.16b, v30.16b   // outputs 49, 53, 57, 61
+        usdot           v26.4s, v17.16b, v30.16b   // outputs 50, 54, 58, 62
+        usdot           v27.4s, v18.16b, v30.16b   // outputs 51, 55, 59, 63
+        xtn             v20.4h, v20.4s   // v20 = outputs 32, 36, ..., 60 as 16-bit
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s   // v21 = outputs 33, 37, ..., 61
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s   // v22 = outputs 34, 38, ..., 62
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s   // v23 = outputs 35, 39, ..., 63
+        xtn2            v23.8h, v27.4s
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64   // 4-way interleave into outputs 32-63, then x0 += 64
+        b.ne            1b   // loop until w3 has counted down to 0
+        ret
+endfunc
+
 .macro EPEL_UNI_W_H_HEADER
         ldr             x12, [sp]
         sub             x2, x2, #1
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4a260e1d9a..b448d755b9 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -171,6 +171,10 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_h, (int16_t *dst,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -283,6 +287,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
         if (have_i8mm(cpu_flags)) {
+            NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 4841f49dab..32f052a7b1 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -33,6 +33,349 @@  const epel_filters, align=4
 endconst
 
 #if HAVE_I8MM
+
+.macro EPEL_H_HEADER
+        movrel          x5, epel_filters        // x5 = address of the epel_filters table
+        add             x5, x5, x4, lsl #2      // x5 += 4 * x4 (x4 presumably mx, 5th prototype argument - confirm)
+        ld1r            {v30.4s}, [x5]          // replicate the selected 32-bit entry into all four lanes of v30
+        sub             x1, x1, #1              // move the source pointer one byte back
+        mov             x10, #(MAX_PB_SIZE * 2) // x10 = MAX_PB_SIZE * 2, used as the per-row step of x0
+.endm
+
+function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.8b}, [x1], x2                // load 8 source bytes, advance x1 by x2
+        subs            w3, w3, #1   // height; flags set early, nothing below writes NZCV before b.ne
+        ext             v5.8b, v4.8b, v4.8b, #1          // source shifted by 1 byte
+        ext             v6.8b, v4.8b, v4.8b, #2          // source shifted by 2 bytes
+        ext             v7.8b, v4.8b, v4.8b, #3          // source shifted by 3 bytes
+        trn1            v4.2s, v4.2s, v5.2s              // 4-byte windows starting at bytes 0 and 1
+        trn1            v6.2s, v6.2s, v7.2s              // 4-byte windows starting at bytes 2 and 3
+        trn1            v4.2d, v4.2d, v6.2d              // v4 = windows 0, 1, 2, 3
+        movi            v16.2d, #0                       // clear accumulator (usdot accumulates)
+        usdot           v16.4s, v4.16b, v30.16b          // one 4-tap dot product per 32-bit lane
+        xtn             v16.4h, v16.4s                   // narrow to 16 bit
+        st1             {v16.4h}, [x0], x10              // store 4 results, advance x0 by x10
+        b.ne            1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2               // load 16 source bytes, advance x1 by x2
+        subs            w3, w3, #1   // height; flags set early, nothing below writes NZCV before b.ne
+        ext             v5.16b, v4.16b, v4.16b, #1       // shifted by 1 (16-byte form so byte 8 is available)
+        ext             v6.8b, v4.8b, v4.8b, #2          // shifted by 2, only the low word is used
+        ext             v7.8b, v4.8b, v4.8b, #3          // shifted by 3, only the low word is used
+        trn1            v16.2s, v4.2s, v5.2s             // windows 0 and 1
+        trn2            v17.2s, v4.2s, v5.2s             // windows 4 and 5
+        trn1            v6.2s, v6.2s, v7.2s              // windows 2 and 3
+        trn1            v16.2d, v16.2d, v6.2d            // v16 = windows 0, 1, 2, 3
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v18.4s, v16.16b, v30.16b         // results 0..3
+        usdot           v19.2s, v17.8b, v30.8b           // results 4..5
+        xtn             v18.4h, v18.4s
+        xtn             v19.4h, v19.4s
+        str             d18, [x0]                        // 4 x int16
+        str             s19, [x0, #8]                    // 2 x int16
+        add             x0, x0, x10                      // next output row (add does not set flags)
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2               // load 16 source bytes, advance x1 by x2
+        subs            w3, w3, #1   // height; flags set early, nothing below writes NZCV before b.ne
+        ext             v5.16b, v4.16b, v4.16b, #1
+        ext             v6.16b, v4.16b, v4.16b, #2
+        ext             v7.16b, v4.16b, v4.16b, #3
+        zip1            v20.4s, v4.4s, v6.4s             // windows 0, 2, 4, 6
+        zip1            v21.4s, v5.4s, v7.4s             // windows 1, 3, 5, 7
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        usdot           v16.4s, v20.16b, v30.16b         // even results
+        usdot           v17.4s, v21.16b, v30.16b         // odd results
+        xtn             v16.4h, v16.4s
+        xtn             v17.4h, v17.4s
+        st2             {v16.4h, v17.4h}, [x0], x10      // interleave even/odd -> results 0..7 in order
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2               // load 16 source bytes, advance x1 by x2
+        subs            w3, w3, #1   // height; flags set early, nothing below writes NZCV before b.ne
+        ext             v5.16b, v4.16b, v4.16b, #1
+        ext             v6.16b, v4.16b, v4.16b, #2
+        ext             v7.16b, v4.16b, v4.16b, #3
+        trn1            v20.2d, v4.2d, v6.2d             // low halves of shift 0 / shift 2
+        trn2            v22.2d, v4.2d, v6.2d             // high halves of shift 0 / shift 2
+        trn1            v21.2d, v5.2d, v7.2d             // low halves of shift 1 / shift 3
+        trn2            v23.2d, v5.2d, v7.2d             // high halves of shift 1 / shift 3
+        trn1            v4.4s, v20.4s, v21.4s            // windows 0, 1, 2, 3
+        trn2            v5.4s, v20.4s, v21.4s            // windows 4, 5, 6, 7
+        trn1            v6.4s, v22.4s, v23.4s            // windows 8, 9, 10, 11
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v4.16b, v30.16b
+        usdot           v17.4s, v5.16b, v30.16b
+        usdot           v18.4s, v6.16b, v30.16b
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v17.4s                   // v16 = results 0..7
+        xtn             v18.4h, v18.4s                   // v18 = results 8..11
+        str             q16, [x0]
+        str             d18, [x0, #16]
+        add             x0, x0, x10                      // next output row (add does not set flags)
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b}, [x1], x2       // load 32 source bytes, advance x1 by x2
+        subs            w3, w3, #1   // height; flags set early, nothing below writes NZCV before b.ne
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        zip1            v20.4s, v0.4s, v6.4s             // windows 0, 2, 4, 6
+        zip2            v22.4s, v0.4s, v6.4s             // windows 8, 10, 12, 14
+        zip1            v21.4s, v5.4s, v7.4s             // windows 1, 3, 5, 7
+        zip2            v23.4s, v5.4s, v7.4s             // windows 9, 11, 13, 15
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v16.4s, v20.16b, v30.16b
+        usdot           v17.4s, v21.16b, v30.16b
+        usdot           v18.4s, v22.16b, v30.16b
+        usdot           v19.4s, v23.16b, v30.16b
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v18.4s                   // v16 = even results 0..14
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v19.4s                   // v17 = odd results 1..15
+        st2             {v16.8h, v17.8h}, [x0], x10      // interleave even/odd -> results 0..15 in order
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b}, [x1], x2       // load 32 source bytes, advance x1 by x2
+        subs            w3, w3, #1   // height; flags set early, nothing below writes NZCV before b.ne
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        ext             v26.16b, v1.16b, v1.16b, #1      // wraps around v1; wrapped bytes only reach lanes
+        ext             v27.16b, v1.16b, v1.16b, #2      // for columns >= 24, which are never stored
+        ext             v28.16b, v1.16b, v1.16b, #3
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v16.4s, v0.16b, v30.16b          // columns 0, 4, 8, 12
+        usdot           v17.4s, v5.16b, v30.16b          // columns 1, 5, 9, 13
+        usdot           v18.4s, v6.16b, v30.16b          // columns 2, 6, 10, 14
+        usdot           v19.4s, v7.16b, v30.16b          // columns 3, 7, 11, 15
+        usdot           v20.4s, v1.16b, v30.16b          // columns 16, 20, (24, 28)
+        usdot           v21.4s, v26.16b, v30.16b         // columns 17, 21, (25, 29)
+        usdot           v22.4s, v27.16b, v30.16b         // columns 18, 22, (26, 30)
+        usdot           v23.4s, v28.16b, v30.16b         // columns 19, 23, (27, 31)
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v20.4s
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v21.4s
+        xtn             v18.4h, v18.4s
+        xtn2            v18.8h, v22.4s
+        xtn             v19.4h, v19.4s
+        xtn2            v19.8h, v23.4s
+        zip1            v20.8h, v16.8h, v18.8h           // even columns 0..14
+        zip1            v21.8h, v17.8h, v19.8h           // odd columns 1..15
+        zip2            v22.8h, v16.8h, v18.8h           // even columns 16..30
+        zip2            v23.8h, v17.8h, v19.8h           // odd columns 17..31
+        zip1            v22.8h, v22.8h, v23.8h           // columns 16..23 in order
+        add             x7, x0, #32                      // address of column 16, taken before x0 advances
+        st2             {v20.8h, v21.8h}, [x0], x10      // columns 0..15, advance x0 by one row
+        st1             {v22.8h}, [x7]                   // columns 16..23
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b, v2.16b}, [x1], x2   // load 48 source bytes, advance x1 by x2
+        subs            w3, w3, #1   // height; flags set early, nothing below writes NZCV before b.ne
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        ext             v26.16b, v1.16b, v2.16b, #1
+        ext             v27.16b, v1.16b, v2.16b, #2
+        ext             v28.16b, v1.16b, v2.16b, #3
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v16.4s, v0.16b, v30.16b          // columns 0, 4, 8, 12
+        usdot           v17.4s, v5.16b, v30.16b          // columns 1, 5, 9, 13
+        usdot           v18.4s, v6.16b, v30.16b          // columns 2, 6, 10, 14
+        usdot           v19.4s, v7.16b, v30.16b          // columns 3, 7, 11, 15
+        usdot           v20.4s, v1.16b, v30.16b          // columns 16, 20, 24, 28
+        usdot           v21.4s, v26.16b, v30.16b         // columns 17, 21, 25, 29
+        usdot           v22.4s, v27.16b, v30.16b         // columns 18, 22, 26, 30
+        usdot           v23.4s, v28.16b, v30.16b         // columns 19, 23, 27, 31
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v20.4s                   // v16 = columns 0, 4, ..., 28
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v21.4s                   // v17 = columns 1, 5, ..., 29
+        xtn             v18.4h, v18.4s
+        xtn2            v18.8h, v22.4s                   // v18 = columns 2, 6, ..., 30
+        xtn             v19.4h, v19.4s
+        xtn2            v19.8h, v23.4s                   // v19 = columns 3, 7, ..., 31
+        st4             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x10   // 4-way interleave -> columns 0..31 in order
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2   // load 64 source bytes, advance x1 by x2
+        subs            w3, w3, #1   // height; flags set early, nothing below writes NZCV before b.ne
+        ext             v4.16b, v0.16b, v1.16b, #1
+        ext             v5.16b, v0.16b, v1.16b, #2
+        ext             v6.16b, v0.16b, v1.16b, #3
+        ext             v16.16b, v1.16b, v2.16b, #1
+        ext             v17.16b, v1.16b, v2.16b, #2
+        ext             v18.16b, v1.16b, v2.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v0.16b, v30.16b          // columns 0, 4, 8, 12
+        usdot           v21.4s, v4.16b, v30.16b          // columns 1, 5, 9, 13
+        usdot           v22.4s, v5.16b, v30.16b          // columns 2, 6, 10, 14
+        usdot           v23.4s, v6.16b, v30.16b          // columns 3, 7, 11, 15
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v1.16b, v30.16b          // columns 16, 20, 24, 28
+        usdot           v25.4s, v16.16b, v30.16b         // columns 17, 21, 25, 29
+        usdot           v26.4s, v17.16b, v30.16b         // columns 18, 22, 26, 30
+        usdot           v27.4s, v18.16b, v30.16b         // columns 19, 23, 27, 31
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        add             x7, x0, #64                      // address of column 32 in THIS row, taken before x0 advances
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x10   // columns 0..31, advance x0 by one row
+        ext             v4.16b, v2.16b, v3.16b, #1
+        ext             v5.16b, v2.16b, v3.16b, #2
+        ext             v6.16b, v2.16b, v3.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v2.16b, v30.16b          // columns 32, 36, 40, 44
+        usdot           v21.4s, v4.16b, v30.16b          // columns 33, 37, 41, 45
+        usdot           v22.4s, v5.16b, v30.16b          // columns 34, 38, 42, 46
+        usdot           v23.4s, v6.16b, v30.16b          // columns 35, 39, 43, 47
+        xtn             v20.4h, v20.4s                   // keep the four phases in separate registers so
+        xtn             v21.4h, v21.4s                   // that a 4-way interleaving store restores the
+        xtn             v22.4h, v22.4s                   // natural column order (a 2-way st2 of packed
+        xtn             v23.4h, v23.4s                   // pairs would emit 32,33,36,37,...)
+        st4             {v20.4h, v21.4h, v22.4h, v23.4h}, [x7]   // columns 32..47 in order
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+        sub             x2, x2, #64                      // the first load below already advances x1 by 64
+1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64   // load 64 source bytes
+        subs            w3, w3, #1   // height; flags set early, nothing below writes NZCV before b.ne
+        ext             v4.16b, v0.16b, v1.16b, #1
+        ext             v5.16b, v0.16b, v1.16b, #2
+        ext             v6.16b, v0.16b, v1.16b, #3
+        ext             v16.16b, v1.16b, v2.16b, #1
+        ext             v17.16b, v1.16b, v2.16b, #2
+        ext             v18.16b, v1.16b, v2.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v0.16b, v30.16b          // columns 0, 4, 8, 12
+        usdot           v21.4s, v4.16b, v30.16b          // columns 1, 5, 9, 13
+        usdot           v22.4s, v5.16b, v30.16b          // columns 2, 6, 10, 14
+        usdot           v23.4s, v6.16b, v30.16b          // columns 3, 7, 11, 15
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v1.16b, v30.16b          // columns 16, 20, 24, 28
+        usdot           v25.4s, v16.16b, v30.16b         // columns 17, 21, 25, 29
+        usdot           v26.4s, v17.16b, v30.16b         // columns 18, 22, 26, 30
+        usdot           v27.4s, v18.16b, v30.16b         // columns 19, 23, 27, 31
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64   // columns 0..31, x0 += 64
+        ld1             {v7.8b}, [x1], x2                // 8 more bytes for the last windows; x1 += x2 (stride - 64)
+        ext             v4.16b, v2.16b, v3.16b, #1
+        ext             v5.16b, v2.16b, v3.16b, #2
+        ext             v6.16b, v2.16b, v3.16b, #3
+        ext             v16.16b, v3.16b, v7.16b, #1
+        ext             v17.16b, v3.16b, v7.16b, #2
+        ext             v18.16b, v3.16b, v7.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v2.16b, v30.16b          // columns 32, 36, 40, 44
+        usdot           v21.4s, v4.16b, v30.16b          // columns 33, 37, 41, 45
+        usdot           v22.4s, v5.16b, v30.16b          // columns 34, 38, 42, 46
+        usdot           v23.4s, v6.16b, v30.16b          // columns 35, 39, 43, 47
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v3.16b, v30.16b          // columns 48, 52, 56, 60
+        usdot           v25.4s, v16.16b, v30.16b         // columns 49, 53, 57, 61
+        usdot           v26.4s, v17.16b, v30.16b         // columns 50, 54, 58, 62
+        usdot           v27.4s, v18.16b, v30.16b         // columns 51, 55, 59, 63
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64   // columns 32..63, x0 += 64
+        b.ne            1b
+        ret
+endfunc
+
 .macro EPEL_UNI_W_H_HEADER
         ldr             x12, [sp]
         sub             x2, x2, #1
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4a260e1d9a..348497bbbe 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -171,6 +171,10 @@  NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_h, (int16_t *dst,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -283,13 +287,14 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
         if (have_i8mm(cpu_flags)) {
+            NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
         }
-
     }
+
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
         c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_10_neon;