
[FFmpeg-devel,v4] avcodec/aarch64/hevc: add luma deblock NEON

Message ID 20240227113309.405627-1-jdek@itanimul.li
State Accepted
Commit 570052cd2a38200ae6aca52e817517513812ec56
Series [FFmpeg-devel,v4] avcodec/aarch64/hevc: add luma deblock NEON

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

J. Dekker Feb. 27, 2024, 11:33 a.m. UTC
Benched using single-threaded full decode on an Ampere Altra.

Bpp Before  After  Speedup
8   73.3s   65.2s  1.124x
10  114.2s  104.0s 1.098x
12  125.8s  115.7s 1.087x

Signed-off-by: J. Dekker <jdek@itanimul.li>
---

 Slightly improved 12bit version.

 libavcodec/aarch64/hevcdsp_deblock_neon.S | 417 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
 2 files changed, 435 insertions(+)

Comments

Martin Storsjö Feb. 27, 2024, 9:56 p.m. UTC | #1
On Tue, 27 Feb 2024, J. Dekker wrote:

> [...]
> diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> index 8227f65649..581056a91e 100644
> --- a/libavcodec/aarch64/hevcdsp_deblock_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> @@ -181,3 +181,420 @@ hevc_h_loop_filter_chroma 12
> hevc_v_loop_filter_chroma 8
> hevc_v_loop_filter_chroma 10
> hevc_v_loop_filter_chroma 12
> +
> +.macro hevc_loop_filter_luma_body bitdepth
> +function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
> +.if \bitdepth > 8
> +        lsl             w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8
> +.else
> +        uxtl            v0.8h, v0.8b
> +        uxtl            v1.8h, v1.8b
> +        uxtl            v2.8h, v2.8b
> +        uxtl            v3.8h, v3.8b
> +        uxtl            v4.8h, v4.8b
> +        uxtl            v5.8h, v5.8b
> +        uxtl            v6.8h, v6.8b
> +        uxtl            v7.8h, v7.8b
> +.endif
> +        ldr             w7, [x3] // tc[0]
> +        ldr             w8, [x3, #4] // tc[1]
> +        dup             v18.4h, w7
> +        dup             v19.4h, w8
> +        trn1            v18.2d, v18.2d, v19.2d
> +.if \bitdepth > 8
> +        shl             v18.8h, v18.8h, #(\bitdepth - 8)
> +.endif
> +        dup             v27.8h, w2 // beta
> +        // tc25
> +        shl             v19.8h, v18.8h, #2 // * 4
> +        add             v19.8h, v19.8h, v18.8h // (tc * 5)
> +        srshr           v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1
> +        sshr            v17.8h, v27.8h, #2 // beta2
> +
> +        ////// beta_2 check
> +        // dp0  = abs(P2  - 2 * P1  + P0)
> +        add             v22.8h, v3.8h, v1.8h
> +        shl             v23.8h, v2.8h, #1
> +        sabd            v30.8h, v22.8h, v23.8h
> +        // dq0  = abs(Q2  - 2 * Q1  + Q0)
> +        add             v21.8h, v6.8h, v4.8h
> +        shl             v26.8h, v5.8h, #1
> +        sabd            v31.8h, v21.8h, v26.8h
> +        // d0   = dp0 + dq0
> +        add             v20.8h, v30.8h, v31.8h
> +        shl             v25.8h, v20.8h, #1
> +        // (d0 << 1) < beta_2
> +        cmgt            v23.8h, v17.8h, v25.8h
> +
> +        ////// beta check
> +        // d0 + d3 < beta
> +        mov             x9, #0xFFFF00000000FFFF
> +        dup             v24.2d, x9
> +        and             v25.16b, v24.16b, v20.16b
> +        addp            v25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
> +        addp            v25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
> +        cmgt            v25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
> +        mov             w9, v25.s[0]

I don't quite understand what this sequence does and/or how our data is 
laid out in our registers - we have d0 on input in v20, where's d3? And 
doesn't the "and" throw away half of the input elements here?

I see some similar patterns with the masking and handling below as well - 
I get a feeling that I don't quite understand the algorithm here, and/or 
the data layout.

> +.if \bitdepth > 8
> +        ld1             {v0.8h}, [x0], x1
> +        ld1             {v1.8h}, [x0], x1
> +        ld1             {v2.8h}, [x0], x1
> +        ld1             {v3.8h}, [x0], x1
> +        ld1             {v4.8h}, [x0], x1
> +        ld1             {v5.8h}, [x0], x1
> +        ld1             {v6.8h}, [x0], x1
> +        ld1             {v7.8h}, [x0]
> +        mov             w14, #((1 << \bitdepth) - 1)

For loads like these, we can generally save a bit by using two alternating 
registers for loading, with a double stride - see e.g. the vp9 loop 
filter implementations. But that's a micro optimization.
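
(As a rough sketch of the pattern meant here, with intrinsics and made-up
names rather than the patch's hand-written assembly: two row pointers offset
by one stride, each advanced by twice the stride, so successive loads do not
all queue up behind a single post-incremented address register.)

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative only: load 8 rows of 8 bytes via two alternating row
 * pointers with a double stride. */
static void load_8x8_rows(const uint8_t *pix, ptrdiff_t stride, uint8x8_t row[8])
{
    const uint8_t *p0 = pix;          /* rows 0, 2, 4, 6 */
    const uint8_t *p1 = pix + stride; /* rows 1, 3, 5, 7 */
    const ptrdiff_t stride2 = 2 * stride;

    for (int i = 0; i < 8; i += 2) {
        row[i]     = vld1_u8(p0); p0 += stride2;
        row[i + 1] = vld1_u8(p1); p1 += stride2;
    }
}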

Other than that, this mostly looks reasonable.

// Martin
J. Dekker Feb. 28, 2024, 8:02 a.m. UTC | #2
Martin Storsjö <martin@martin.st> writes:

> On Tue, 27 Feb 2024, J. Dekker wrote:
>
>> [...]
>> +        ////// beta_2 check
>> +        // dp0  = abs(P2  - 2 * P1  + P0)
>> +        add             v22.8h, v3.8h, v1.8h
>> +        shl             v23.8h, v2.8h, #1
>> +        sabd            v30.8h, v22.8h, v23.8h
>> +        // dq0  = abs(Q2  - 2 * Q1  + Q0)
>> +        add             v21.8h, v6.8h, v4.8h
>> +        shl             v26.8h, v5.8h, #1
>> +        sabd            v31.8h, v21.8h, v26.8h
>> +        // d0   = dp0 + dq0
>> +        add             v20.8h, v30.8h, v31.8h
>> +        shl             v25.8h, v20.8h, #1
>> +        // (d0 << 1) < beta_2
>> +        cmgt            v23.8h, v17.8h, v25.8h
>> +
>> +        ////// beta check
>> +        // d0 + d3 < beta
>> +        mov             x9, #0xFFFF00000000FFFF
>> +        dup             v24.2d, x9
>> +        and             v25.16b, v24.16b, v20.16b
>> +        addp            v25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
>> +        addp            v25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
>> +        cmgt            v25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
>> +        mov             w9, v25.s[0]
>
> I don't quite understand what this sequence does and/or how our data is laid
> out in our registers - we have d0 on input in v20, where's d3? And doesn't the
> "and" throw away half of the input elements here?
>
> I see some similar patterns with the masking and handling below as well - I get
> a feeling that I don't quite understand the algorithm here, and/or the data
> layout.

We have d0, d1, d2, d3 for both 4-line blocks in v20; we mask out d1/d2 and
use pair-wise adds to move our data around and calculate d0+d3
together. The first addp just moves elements around, the second addp
adds d0 + 0 + 0 + d3.

Then we can check d0+d3 < beta, using the fact that the compare returns
either 0 or -1, and sign-extend that to half the register width for a
mask. This allows us to calculate both 4-line block masks at the same
time in NEON registers.
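
To make the lane layout concrete, here is a scalar model of that sequence in
C (illustrative only, not FFmpeg code; the arrays mirror the NEON lanes and
the function name is made up):

#include <stdint.h>

/* v20 holds one 16-bit lane per line: [d0 d1 d2 d3 | d0' d1' d2' d3'],
 * where the primed values belong to the second 4-line block. */
static void beta_check(const uint16_t v20[8], int beta, int16_t mask[2])
{
    static const uint16_t keep[8] = { 1, 0, 0, 1, 1, 0, 0, 1 };
    uint16_t m[8], p[4];

    for (int i = 0; i < 8; i++)          /* and  v25.16b, v24.16b, v20.16b  */
        m[i] = keep[i] ? v20[i] : 0;     /* keep d0/d3, zero d1/d2          */
    for (int i = 0; i < 4; i++)          /* addp v25.8h: d0+0 0+d3 d0'+0 0+d3' */
        p[i] = m[2 * i] + m[2 * i + 1];

    /* addp v25.4h: h[0] = d0 + d3, h[1] = d0' + d3' */
    int lo = p[0] + p[1];
    int hi = p[2] + p[3];

    /* cmgt v25.4h, v27.4h, v25.4h: all-ones where that 4-line block filters */
    mask[0] = lo < beta ? -1 : 0;
    mask[1] = hi < beta ? -1 : 0;
}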

>> +.if \bitdepth > 8
>> +        ld1             {v0.8h}, [x0], x1
>> +        ld1             {v1.8h}, [x0], x1
>> +        ld1             {v2.8h}, [x0], x1
>> +        ld1             {v3.8h}, [x0], x1
>> +        ld1             {v4.8h}, [x0], x1
>> +        ld1             {v5.8h}, [x0], x1
>> +        ld1             {v6.8h}, [x0], x1
>> +        ld1             {v7.8h}, [x0]
>> +        mov             w14, #((1 << \bitdepth) - 1)
>
> For loads like these, we can generally save a bit by using two alternating
> registers for loading, with a double stride - see e.g. the vp9 loop filter
> implementations. But that's a micro optimization.
>
> Other than that, this mostly looks reasonable.

Will fix on push if no other comments.
Martin Storsjö Feb. 28, 2024, 8:27 a.m. UTC | #3
On Wed, 28 Feb 2024, J. Dekker wrote:

>
> Martin Storsjö <martin@martin.st> writes:
>
>> On Tue, 27 Feb 2024, J. Dekker wrote:
>>
>>> [...]
>>
>> I don't quite understand what this sequence does and/or how our data is laid
>> out in our registers - we have d0 on input in v20, where's d3? And doesn't the
>> "and" throw away half of the input elements here?
>>
>> I see some similar patterns with the masking and handling below as well - I get
>> a feeling that I don't quite understand the algorithm here, and/or the data
>> layout.
>
> We have d0, d1, d2, d3 for both 4 line blocks in v20, mask out d1/d2 and
> use pair-wise adds to move our data around and calculate d0+d3
> together. The first addp just moves elements around, the second addp
> adds d0 + 0 + 0 + d3.

Right, I guess this is the bit that was surprising. I would have expected 
to have e.g. all the d0 values for e.g. the 8 individual pixels in one 
SIMD register, and all the d3 values for all pixels in another SIMD 
register.

So as we're operating on 8 pixels in parallel, each of those 8 pixels has 
its own d0/d3 values, right? Or is this a case where we have just one 
d0/d3 value for a range of pixels?

// Martin
J. Dekker Feb. 28, 2024, 8:30 a.m. UTC | #4
Martin Storsjö <martin@martin.st> writes:

> On Wed, 28 Feb 2024, J. Dekker wrote:
>
>>
>> Martin Storsjö <martin@martin.st> writes:
>>
>>> [...]
>>
>> We have d0, d1, d2, d3 for both 4 line blocks in v20, mask out d1/d2 and
>> use pair-wise adds to move our data around and calculate d0+d3
>> together. The first addp just moves elements around, the second addp
>> adds d0 + 0 + 0 + d3.
>
> Right, I guess this is the bit that was surprising. I would have expected to
> have e.g. all the d0 values for e.g. the 8 individual pixels in one SIMD
> register, and all the d3 values for all pixels in another SIMD register.
>
> So as we're operating on 8 pixels in parallel, each of those 8 pixels has
> its own d0/d3 values, right? Or is this a case where we have just one
> value for a range of pixels?

Yes, d0/d1/d2/d3 are per 4 lines of 8 pixels: d0 and d3 are each
calculated within their own line, d0 from line 0 and d3 from line 3. Maybe
it's more confusing since we are doing both halves of the filter at the
same time? v20 contains d0 d1 d2 d3 d0 d1 d2 d3, where the second d0 is
distinct from the first.

But essentially we're doing the same operation across the entire 8
lines; the filter just makes an overall skip decision for each block of
4 lines based on the sum of the results from lines 0 and 3.
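
For reference, the decision being vectorized looks roughly like this in
scalar form (a sketch following the HEVC deblocking rules, not copied from
FFmpeg's C code; the names are illustrative):

#include <stdlib.h>

/* P[line][i] holds P_i of that line, Q[line][i] holds Q_i, for the 4 lines
 * of one block along the edge. */
static int filter_4line_block(const int P[4][4], const int Q[4][4], int beta)
{
    int dp0 = abs(P[0][2] - 2 * P[0][1] + P[0][0]); /* line 0 */
    int dq0 = abs(Q[0][2] - 2 * Q[0][1] + Q[0][0]);
    int dp3 = abs(P[3][2] - 2 * P[3][1] + P[3][0]); /* line 3 */
    int dq3 = abs(Q[3][2] - 2 * Q[3][1] + Q[3][0]);
    int d0  = dp0 + dq0;
    int d3  = dp3 + dq3;

    return d0 + d3 < beta; /* one skip decision for the whole 4-line block */
}

The NEON version evaluates this for both 4-line blocks at once, which is why
v20 carries two interleaved sets of d values.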
Martin Storsjö Feb. 28, 2024, 9:13 a.m. UTC | #5
On Wed, 28 Feb 2024, J. Dekker wrote:

>
> Martin Storsjö <martin@martin.st> writes:
>
>> On Wed, 28 Feb 2024, J. Dekker wrote:
>>
>>> [...]
>>
>> Right, I guess this is the bit that was surprising. I would have expected to
>> have e.g. all the d0 values for e.g. the 8 individual pixels in one SIMD
>> register, and all the d3 values for all pixels in another SIMD register.
>>
>> So as we're operating on 8 pixels in parallel, each of those 8 pixels has
>> its own d0/d3 values, right? Or is this a case where we have just one d0/d3
>> value for a range of pixels?
>
> Yes, d0/d1/d2/d3 are per 4 lines of 8 pixels, it's because d0 and d3 are
> calculated within their own line, d0 from line 0, d3 from line 3. Maybe
> it's more confusing since we are doing both halves of the filter at the
> same time? v20 contains d0 d1 d2 d3 d0 d1 d2 d3, where the second d0 is
> distinct from the first.
>
> But essentially we're doing the same operation across the entire 8
> lines, the filter just makes an overall skip decision for each block of
> 4 lines based on the sum of the result from line 0 and 3.

Ah, right, I see. I guess this makes sense then. Thanks!

Thus, no further objections to it; the optimizing of loading/storing can 
be done separately.

// Martin
J. Dekker Feb. 28, 2024, 9:17 a.m. UTC | #6
Martin Storsjö <martin@martin.st> writes:

> On Wed, 28 Feb 2024, J. Dekker wrote:
>
> [...]
>
> Ah, right, I see. I guess this makes sense then. Thanks!
>
> Thus, no further objections to it; the optimizing of loading/storing can be
> done separately.
>

Thanks, pushed. Will post load/store optimization.

Patch

diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S
index 8227f65649..581056a91e 100644
--- a/libavcodec/aarch64/hevcdsp_deblock_neon.S
+++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
@@ -181,3 +181,420 @@  hevc_h_loop_filter_chroma 12
 hevc_v_loop_filter_chroma 8
 hevc_v_loop_filter_chroma 10
 hevc_v_loop_filter_chroma 12
+
+.macro hevc_loop_filter_luma_body bitdepth
+function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
+.if \bitdepth > 8
+        lsl             w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8
+.else
+        uxtl            v0.8h, v0.8b
+        uxtl            v1.8h, v1.8b
+        uxtl            v2.8h, v2.8b
+        uxtl            v3.8h, v3.8b
+        uxtl            v4.8h, v4.8b
+        uxtl            v5.8h, v5.8b
+        uxtl            v6.8h, v6.8b
+        uxtl            v7.8h, v7.8b
+.endif
+        ldr             w7, [x3] // tc[0]
+        ldr             w8, [x3, #4] // tc[1]
+        dup             v18.4h, w7
+        dup             v19.4h, w8
+        trn1            v18.2d, v18.2d, v19.2d
+.if \bitdepth > 8
+        shl             v18.8h, v18.8h, #(\bitdepth - 8)
+.endif
+        dup             v27.8h, w2 // beta
+        // tc25
+        shl             v19.8h, v18.8h, #2 // * 4
+        add             v19.8h, v19.8h, v18.8h // (tc * 5)
+        srshr           v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1
+        sshr            v17.8h, v27.8h, #2 // beta2
+
+        ////// beta_2 check
+        // dp0  = abs(P2  - 2 * P1  + P0)
+        add             v22.8h, v3.8h, v1.8h
+        shl             v23.8h, v2.8h, #1
+        sabd            v30.8h, v22.8h, v23.8h
+        // dq0  = abs(Q2  - 2 * Q1  + Q0)
+        add             v21.8h, v6.8h, v4.8h
+        shl             v26.8h, v5.8h, #1
+        sabd            v31.8h, v21.8h, v26.8h
+        // d0   = dp0 + dq0
+        add             v20.8h, v30.8h, v31.8h
+        shl             v25.8h, v20.8h, #1
+        // (d0 << 1) < beta_2
+        cmgt            v23.8h, v17.8h, v25.8h
+
+        ////// beta check
+        // d0 + d3 < beta
+        mov             x9, #0xFFFF00000000FFFF
+        dup             v24.2d, x9
+        and             v25.16b, v24.16b, v20.16b
+        addp            v25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
+        addp            v25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
+        cmgt            v25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
+        mov             w9, v25.s[0]
+        cmp             w9, #0
+        sxtl            v26.4s, v25.4h
+        sxtl            v16.2d, v26.2s // full skip mask
+        b.eq            3f // skip both blocks
+
+        // TODO: we can check the full skip mask with the weak/strong mask to
+        // potentially skip weak or strong calculation entirely if we only have one
+
+        ////// beta_3 check
+        // abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3
+        sshr            v17.8h, v17.8h, #1 // beta_3
+        sabd            v20.8h, v0.8h, v3.8h
+        saba            v20.8h, v7.8h, v4.8h
+        cmgt            v21.8h, v17.8h, v20.8h
+
+        and             v23.16b, v23.16b, v21.16b
+
+        ////// tc25 check
+        // abs(P0  -  Q0) < tc25
+        sabd            v20.8h, v3.8h, v4.8h
+        cmgt            v21.8h, v19.8h, v20.8h
+
+        and             v23.16b, v23.16b, v21.16b
+
+        ////// Generate low/high line max from lines 0/3/4/7
+        // mask out lines 2/3/5/6
+        not             v20.16b, v24.16b // 0x0000FFFFFFFF0000
+        orr             v23.16b, v23.16b, v20.16b
+
+        // generate weak/strong mask
+        uminp           v23.8h, v23.8h, v23.8h // extend to singles
+        sxtl            v23.4s, v23.4h
+        uminp           v26.4s, v23.4s, v23.4s // check lines
+        // extract to gpr
+        ext             v25.16b, v26.16b, v26.16b, #2
+        zip1            v17.4s, v26.4s, v26.4s
+        mov             w12, v25.s[0]
+        mov             w11, #0x0000FFFF
+        mov             w13, #0xFFFF0000
+        // FFFF FFFF -> strong strong
+        // FFFF 0000 -> strong weak
+        // 0000 FFFF -> weak   strong
+        // 0000 0000 -> weak   weak
+        cmp             w12, w13
+        b.hi            0f // only strong/strong, skip weak nd_p/nd_q calc
+
+        ////// weak nd_p/nd_q
+        // d0+d3
+        and             v30.16b, v30.16b, v24.16b // d0 __ __ d3 d4 __ __ d7
+        and             v31.16b, v31.16b, v24.16b
+        addp            v30.8h, v30.8h, v30.8h // [d0+__ __+d3 d4+__ __+d7] [ ... ]
+        addp            v31.8h, v31.8h, v31.8h // [d0+d3 d4+d7]
+        addp            v30.4h, v30.4h, v30.4h
+        addp            v31.4h, v31.4h, v31.4h
+
+        // ((beta + (beta >> 1)) >> 3)
+        sshr            v21.8h, v27.8h, #1
+        add             v21.8h, v21.8h, v27.8h
+        sshr            v21.8h, v21.8h, #3
+
+        // nd_p = dp0 + dp3 < ((beta + (beta >> 1)) >> 3)
+        cmgt            v30.8h, v21.8h, v30.8h
+        // nd_q = dq0 + dq3 < ((beta + (beta >> 1)) >> 3)
+        cmgt            v31.8h, v21.8h, v31.8h
+
+        sxtl            v30.4s, v30.4h
+        sxtl            v31.4s, v31.4h
+        sxtl            v28.2d, v30.2s
+        sxtl            v29.2d, v31.2s
+
+        cmp             w12, w11
+        b.lo            1f // can only be weak weak, skip strong
+
+0:      // STRONG FILTER
+
+        // P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc3, tc3);
+        add             v21.8h, v2.8h, v3.8h   // (p1 + p0
+        add             v21.8h, v4.8h, v21.8h  //     + q0)
+        shl             v21.8h, v21.8h, #1     //           * 2
+        add             v22.8h, v1.8h, v5.8h   //   (p2 + q1)
+        add             v21.8h, v22.8h, v21.8h // +
+        srshr           v21.8h, v21.8h, #3     //               >> 3
+        sub             v21.8h, v21.8h, v3.8h  //                    - p0
+
+        // P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
+
+        add             v22.8h, v1.8h, v2.8h
+        add             v23.8h, v3.8h, v4.8h
+        add             v22.8h, v22.8h, v23.8h
+        srshr           v22.8h, v22.8h, #2
+        sub             v22.8h, v22.8h, v2.8h
+
+        // P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc, tc);
+
+        add             v23.8h, v0.8h, v1.8h // p3 + p2
+        add             v24.8h, v3.8h, v4.8h // p0 + q0
+        shl             v23.8h, v23.8h, #1 // * 2
+        add             v23.8h, v23.8h, v24.8h
+        add             v24.8h, v1.8h, v2.8h // p2 + p1
+        add             v23.8h, v23.8h, v24.8h
+        srshr           v23.8h, v23.8h, #3
+        sub             v23.8h, v23.8h, v1.8h
+
+        // Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc3, tc3);
+        add             v24.8h, v3.8h, v4.8h   // (p0 + q0
+        add             v24.8h, v5.8h, v24.8h  //     + q1)
+        shl             v24.8h, v24.8h, #1     //           * 2
+        add             v25.8h, v2.8h, v6.8h   //   (p1 + q2)
+        add             v24.8h, v25.8h, v24.8h // +
+        srshr           v24.8h, v24.8h, #3     //               >> 3
+        sub             v24.8h, v24.8h, v4.8h  //                    - q0
+
+        // Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
+
+        add             v25.8h, v6.8h, v5.8h
+        add             v26.8h, v3.8h, v4.8h
+        add             v25.8h, v25.8h, v26.8h
+        srshr           v25.8h, v25.8h, #2
+        sub             v25.8h, v25.8h, v5.8h
+
+        // Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc, tc);
+
+        add             v26.8h, v7.8h, v6.8h
+        add             v27.8h, v6.8h, v5.8h
+        shl             v26.8h, v26.8h, #1
+        add             v26.8h, v26.8h, v27.8h
+        add             v27.8h, v3.8h, v4.8h
+        add             v26.8h, v26.8h, v27.8h
+        srshr           v26.8h, v26.8h, #3
+        sub             v26.8h, v26.8h, v6.8h
+
+        // this clip should work properly
+        shl             v30.8h, v18.8h, #1 // tc2
+        neg             v31.8h, v30.8h // -tc2
+        clip            v31.8h, v30.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h
+
+        and             v21.16b, v21.16b, v16.16b
+        and             v22.16b, v22.16b, v16.16b
+        and             v23.16b, v23.16b, v16.16b
+        and             v24.16b, v24.16b, v16.16b
+        and             v25.16b, v25.16b, v16.16b
+        and             v26.16b, v26.16b, v16.16b
+
+        add             v23.8h, v23.8h, v1.8h // careful
+        add             v22.8h, v22.8h, v2.8h
+        add             v21.8h, v21.8h, v3.8h
+        add             v24.8h, v24.8h, v4.8h
+        add             v25.8h, v25.8h, v5.8h
+        add             v26.8h, v26.8h, v6.8h
+
+        cmp             w12, w13
+        b.hi            2f // only strong/strong, skip weak
+
+1:      // WEAK FILTER
+
+        // delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
+.if \bitdepth < 12
+        sub             v27.8h, v4.8h, v3.8h // q0 - p0
+        shl             v30.8h, v27.8h, #3 // * 8
+        add             v27.8h, v27.8h, v30.8h // 9 * (q0 - p0)
+
+        sub             v30.8h, v5.8h, v2.8h // q1 - p1
+        shl             v31.8h, v30.8h, #1 // * 2
+
+        sub             v27.8h, v27.8h, v31.8h
+        sub             v27.8h, v27.8h, v30.8h // - 3 * (q1 - p1)
+        srshr           v27.8h, v27.8h, #4
+.else
+        sub             v19.8h, v4.8h, v3.8h // q0 - p0
+        sub             v20.8h, v5.8h, v2.8h // q1 - p1
+
+        sshll           v30.4s, v19.4h, #3 // * 8
+        sshll2          v31.4s, v19.8h, #3
+
+        shl             v27.8h, v20.8h, #1
+
+        saddw           v30.4s, v30.4s, v19.4h // 9 * (q0 - p0)
+        saddw2          v31.4s, v31.4s, v19.8h
+
+        saddl           v19.4s, v27.4h, v20.4h // 3 * (q1 - p1)
+        saddl2          v20.4s, v27.8h, v20.8h
+
+        sub             v19.4s, v30.4s, v19.4s
+        sub             v20.4s, v31.4s, v20.4s
+
+        sqrshrn         v27.4h, v19.4s, #4
+        sqrshrn2        v27.8h, v20.4s, #4
+.endif
+
+        // delta0 10tc check mask
+        shl             v30.8h, v18.8h, #1 // * 2
+        shl             v31.8h, v18.8h, #3 // * 8
+        add             v30.8h, v30.8h, v31.8h // 10 * tc
+        abs             v31.8h, v27.8h
+        cmgt            v20.8h, v30.8h, v31.8h // abs(delta0) < 10 * tc
+
+        and             v20.16b, v20.16b, v16.16b // combine with full mask
+
+        neg             v31.8h, v18.8h // -tc
+        clip            v31.8h, v18.8h, v27.8h // delta0 = av_clip(delta0, -tc, tc)
+
+        // deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2)
+        add             v30.8h, v1.8h, v3.8h
+        srshr           v30.8h, v30.8h, #1
+        sub             v30.8h, v30.8h, v2.8h
+        add             v30.8h, v30.8h, v27.8h
+        sshr            v30.8h, v30.8h, #1
+
+        // p3 p2 p1 p0 q0 q1 q2 q3
+        // v0 v1 v2 v3 v4 v5 v6 v7
+
+        // deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
+        add             v31.8h, v6.8h, v4.8h
+        srshr           v31.8h, v31.8h, #1
+        sub             v31.8h, v31.8h, v5.8h
+        sub             v31.8h, v31.8h, v27.8h
+        sshr            v31.8h, v31.8h, #1
+
+        // apply nd_p nd_q mask to deltap1/deltaq1
+        and             v30.16b, v30.16b, v28.16b
+        and             v31.16b, v31.16b, v29.16b
+
+        // apply full skip mask to deltap1/deltaq1/delta0
+        and             v30.16b, v30.16b, v20.16b
+        and             v27.16b, v27.16b, v20.16b
+        and             v31.16b, v31.16b, v20.16b
+
+        // clip P1/Q1 to -tc_2, tc_2
+        sshr            v18.8h, v18.8h, #1 // tc2
+        neg             v28.8h, v18.8h
+        clip            v28.8h, v18.8h, v30.8h, v31.8h
+
+        // P0 = av_clip_pixel(p0 + delta0)
+        // Q0 = av_clip_pixel(q0 - delta0)
+        add             v29.8h, v3.8h, v27.8h // P0
+        sub             v27.8h, v4.8h, v27.8h // Q0
+
+        // P1 = av_clip_pixel(p1 + deltap1)
+        // Q1 = av_clip_pixel(q1 + deltaq1)
+        add             v30.8h, v2.8h, v30.8h // P1
+        add             v31.8h, v5.8h, v31.8h // Q1
+
+2:      // MIX WEAK/STRONG
+
+        mov             v19.16b, v1.16b
+        mov             v20.16b, v6.16b
+        // copy selection mask
+        mov             v1.16b, v17.16b
+        mov             v2.16b, v17.16b
+        mov             v3.16b, v17.16b
+        mov             v4.16b, v17.16b
+        mov             v5.16b, v17.16b
+        mov             v6.16b, v17.16b
+        // select
+        bsl             v1.16b, v23.16b, v19.16b // P2 strong/orig
+        bsl             v2.16b, v22.16b, v30.16b // P1 strong/weak
+        bsl             v3.16b, v21.16b, v29.16b // P0 strong/weak
+        bsl             v4.16b, v24.16b, v27.16b // Q0 strong/weak
+        bsl             v5.16b, v25.16b, v31.16b // Q1 strong/weak
+        bsl             v6.16b, v26.16b, v20.16b // Q2 strong/orig
+        // NOTE: Q3/P3 are unchanged
+
+.if \bitdepth > 8
+        movi            v19.8h, #0
+        dup             v20.8h, w14
+        clip            v19.8h, v20.8h, v1.8h, v2.8h, v3.8h, v4.8h, v5.8h, v6.8h
+.else
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+        sqxtun          v2.8b, v2.8h
+        sqxtun          v3.8b, v3.8h
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        sqxtun          v7.8b, v7.8h
+.endif
+        ret
+3:      ret             x6
+endfunc
+.endm
+
+hevc_loop_filter_luma_body 8
+hevc_loop_filter_luma_body 10
+hevc_loop_filter_luma_body 12
+
+// hevc_v_loop_filter_luma(uint8_t *pix, ptrdiff_t stride, int beta, const int32_t *tc, const uint8_t *no_p, const uint8_t *no_q)
+
+.macro hevc_loop_filter_luma dir, bitdepth
+function ff_hevc_\dir\()_loop_filter_luma_\bitdepth\()_neon, export=1
+        mov             x6, x30
+.ifc \dir, v
+.if \bitdepth > 8
+        sub             x0, x0, #8
+.else
+        sub             x0, x0, #4
+.endif
+.else
+        sub             x0, x0, x1, lsl #2 // -4 * xstride
+.endif
+        mov             x10, x0
+.if \bitdepth > 8
+        ld1             {v0.8h}, [x0], x1
+        ld1             {v1.8h}, [x0], x1
+        ld1             {v2.8h}, [x0], x1
+        ld1             {v3.8h}, [x0], x1
+        ld1             {v4.8h}, [x0], x1
+        ld1             {v5.8h}, [x0], x1
+        ld1             {v6.8h}, [x0], x1
+        ld1             {v7.8h}, [x0]
+        mov             w14, #((1 << \bitdepth) - 1)
+.ifc \dir, v
+        transpose_8x8H  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+.endif
+.else
+        ld1             {v0.8b}, [x0], x1
+        ld1             {v1.8b}, [x0], x1
+        ld1             {v2.8b}, [x0], x1
+        ld1             {v3.8b}, [x0], x1
+        ld1             {v4.8b}, [x0], x1
+        ld1             {v5.8b}, [x0], x1
+        ld1             {v6.8b}, [x0], x1
+        ld1             {v7.8b}, [x0]
+.ifc \dir, v
+        transpose_8x8B  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+.endif
+.endif
+        bl              hevc_loop_filter_luma_body_\bitdepth\()_neon
+.if \bitdepth > 8
+.ifc \dir, v
+        transpose_8x8H  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+.endif
+        st1             {v0.8h}, [x10], x1
+        st1             {v1.8h}, [x10], x1
+        st1             {v2.8h}, [x10], x1
+        st1             {v3.8h}, [x10], x1
+        st1             {v4.8h}, [x10], x1
+        st1             {v5.8h}, [x10], x1
+        st1             {v6.8h}, [x10], x1
+        st1             {v7.8h}, [x10]
+.else
+.ifc \dir, v
+        transpose_8x8B  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+.endif
+        st1             {v0.8b}, [x10], x1
+        st1             {v1.8b}, [x10], x1
+        st1             {v2.8b}, [x10], x1
+        st1             {v3.8b}, [x10], x1
+        st1             {v4.8b}, [x10], x1
+        st1             {v5.8b}, [x10], x1
+        st1             {v6.8b}, [x10], x1
+        st1             {v7.8b}, [x10]
+.endif
+        ret             x6
+endfunc
+.endm
+
+hevc_loop_filter_luma h, 8
+hevc_loop_filter_luma h, 10
+hevc_loop_filter_luma h, 12
+
+hevc_loop_filter_luma v, 8
+hevc_loop_filter_luma v, 10
+hevc_loop_filter_luma v, 12
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 687b6cc5c3..04692aa98e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -38,6 +38,18 @@  void ff_hevc_h_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride,
                                           const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
 void ff_hevc_h_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride,
                                           const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_luma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_luma_10_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_luma_12_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_10_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_12_neon(uint8_t *_pix, ptrdiff_t _stride, int beta,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
 void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs,
                                      ptrdiff_t stride);
 void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs,
@@ -291,6 +303,8 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
     if (!have_neon(cpu_flags)) return;
 
     if (bit_depth == 8) {
+        c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_8_neon;
+        c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_8_neon;
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_8_neon;
         c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_8_neon;
         c->add_residual[0]             = ff_hevc_add_residual_4x4_8_neon;
@@ -379,6 +393,8 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 
     }
     if (bit_depth == 10) {
+        c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_10_neon;
+        c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_10_neon;
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
         c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_10_neon;
         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
@@ -395,6 +411,8 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_10_neon;
     }
     if (bit_depth == 12) {
+        c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_12_neon;
+        c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_12_neon;
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_12_neon;
         c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_12_neon;
         c->add_residual[0]             = ff_hevc_add_residual_4x4_12_neon;