| Message ID | 20240227113309.405627-1-jdek@itanimul.li |
|---|---|
| State | Accepted |
| Commit | 570052cd2a38200ae6aca52e817517513812ec56 |
| Series | [FFmpeg-devel,v4] avcodec/aarch64/hevc: add luma deblock NEON |
| Context | Check | Description |
|---|---|---|
| yinshiyou/make_loongarch64 | success | Make finished |
| yinshiyou/make_fate_loongarch64 | success | Make fate finished |
| andriy/make_x86 | success | Make finished |
| andriy/make_fate_x86 | success | Make fate finished |
On Tue, 27 Feb 2024, J. Dekker wrote:

> Benched using single-threaded full decode on an Ampere Altra.
>
> Bpp Before After Speedup
> 8   73,3s  65,2s  1.124x
> 10  114,2s 104,0s 1.098x
> 12  125,8s 115,7s 1.087x
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>
> Slightly improved 12bit version.
>
>  libavcodec/aarch64/hevcdsp_deblock_neon.S | 417 ++++++++++++++++++++++
>  libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
>  2 files changed, 435 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> index 8227f65649..581056a91e 100644
> --- a/libavcodec/aarch64/hevcdsp_deblock_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> @@ -181,3 +181,420 @@ hevc_h_loop_filter_chroma 12
> hevc_v_loop_filter_chroma 8
> hevc_v_loop_filter_chroma 10
> hevc_v_loop_filter_chroma 12
> +
> +.macro hevc_loop_filter_luma_body bitdepth
> +function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
> +.if \bitdepth > 8
> + lsl w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8
> +.else
> + uxtl v0.8h, v0.8b
> + uxtl v1.8h, v1.8b
> + uxtl v2.8h, v2.8b
> + uxtl v3.8h, v3.8b
> + uxtl v4.8h, v4.8b
> + uxtl v5.8h, v5.8b
> + uxtl v6.8h, v6.8b
> + uxtl v7.8h, v7.8b
> +.endif
> + ldr w7, [x3] // tc[0]
> + ldr w8, [x3, #4] // tc[1]
> + dup v18.4h, w7
> + dup v19.4h, w8
> + trn1 v18.2d, v18.2d, v19.2d
> +.if \bitdepth > 8
> + shl v18.8h, v18.8h, #(\bitdepth - 8)
> +.endif
> + dup v27.8h, w2 // beta
> + // tc25
> + shl v19.8h, v18.8h, #2 // * 4
> + add v19.8h, v19.8h, v18.8h // (tc * 5)
> + srshr v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1
> + sshr v17.8h, v27.8h, #2 // beta2
> +
> + ////// beta_2 check
> + // dp0 = abs(P2 - 2 * P1 + P0)
> + add v22.8h, v3.8h, v1.8h
> + shl v23.8h, v2.8h, #1
> + sabd v30.8h, v22.8h, v23.8h
> + // dq0 = abs(Q2 - 2 * Q1 + Q0)
> + add v21.8h, v6.8h, v4.8h
> + shl v26.8h, v5.8h, #1
> + sabd v31.8h, v21.8h, v26.8h
> + // d0 = dp0 + dq0
> + add v20.8h, v30.8h, v31.8h
> + shl v25.8h, v20.8h, #1
> + // (d0 << 1) < beta_2
> + cmgt v23.8h, v17.8h, v25.8h
> +
> + ////// beta check
> + // d0 + d3 < beta
> + mov x9, #0xFFFF00000000FFFF
> + dup v24.2d, x9
> + and v25.16b, v24.16b, v20.16b
> + addp v25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
> + addp v25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
> + cmgt v25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
> + mov w9, v25.s[0]

I don't quite understand what this sequence does and/or how our data is laid
out in our registers - we have d0 on input in v20, where's d3? And doesn't the
"and" throw away half of the input elements here?

I see some similar patterns with the masking and handling below as well - I get
a feeling that I don't quite understand the algorithm here, and/or the data
layout.

> +.if \bitdepth > 8
> + ld1 {v0.8h}, [x0], x1
> + ld1 {v1.8h}, [x0], x1
> + ld1 {v2.8h}, [x0], x1
> + ld1 {v3.8h}, [x0], x1
> + ld1 {v4.8h}, [x0], x1
> + ld1 {v5.8h}, [x0], x1
> + ld1 {v6.8h}, [x0], x1
> + ld1 {v7.8h}, [x0]
> + mov w14, #((1 << \bitdepth) - 1)

For loads like these, we can generally save a bit by using two alternating
registers for loading, with a double stride - see e.g. the vp9 loop filter
implementations. But that's a micro optimization.

Other than that, this mostly looks reasonable.

// Martin
Martin Storsjö <martin@martin.st> writes:

> On Tue, 27 Feb 2024, J. Dekker wrote:
>
> [...]
>
>> + ////// beta check
>> + // d0 + d3 < beta
>> + mov x9, #0xFFFF00000000FFFF
>> + dup v24.2d, x9
>> + and v25.16b, v24.16b, v20.16b
>> + addp v25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
>> + addp v25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
>> + cmgt v25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
>> + mov w9, v25.s[0]
>
> I don't quite understand what this sequence does and/or how our data is laid
> out in our registers - we have d0 on input in v20, where's d3? And doesn't the
> "and" throw away half of the input elements here?
>
> I see some similar patterns with the masking and handling below as well - I get
> a feeling that I don't quite understand the algorithm here, and/or the data
> layout.

We have d0, d1, d2, d3 for both 4 line blocks in v20, mask out d1/d2 and
use pair-wise adds to move our data around and calculate d0+d3
together. The first addp just moves elements around, the second addp
adds d0 + 0 + 0 + d3. Then we can check d0+d3 < beta and use the fact
that the compare returns either 0 or -1 and sign-extend to half the
register width for a mask. This allows us to calculate both 4 line
block masks at the same time in NEON registers.
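In scalar terms, the lane shuffle described above works out to roughly the
following (an illustrative model only; the array and function names are made
up for the example, not taken from the patch):

```c
#include <stdint.h>

/* Model of the masked pairwise-add sequence: d[0..3] are the per-line d
 * values of the first 4-line block, d[4..7] those of the second block.
 * The AND keeps lanes 0, 3, 4 and 7; the two ADDPs then leave d0+d3 of the
 * first block in lane 0 and d0+d3 of the second block in lane 1. */
static void beta_check(const uint16_t d[8], uint16_t beta,
                       int *filter_block0, int *filter_block1)
{
    uint16_t m[8], p1[8], p2[4];

    for (int i = 0; i < 8; i++)      /* and with 0xFFFF00000000FFFF */
        m[i] = (i == 0 || i == 3 || i == 4 || i == 7) ? d[i] : 0;

    for (int i = 0; i < 4; i++)      /* addp v25.8h, v25.8h, v25.8h */
        p1[i] = p1[i + 4] = m[2 * i] + m[2 * i + 1];

    for (int i = 0; i < 4; i++)      /* addp v25.4h, v25.4h, v25.4h */
        p2[i] = p1[2 * i] + p1[2 * i + 1];

    /* p2[0] == d[0] + d[3], p2[1] == d[4] + d[7]; the cmgt produces an
     * all-ones/all-zeros lane mask instead of a boolean. */
    *filter_block0 = p2[0] < beta;
    *filter_block1 = p2[1] < beta;
}
```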
>> +.if \bitdepth > 8
>> + ld1 {v0.8h}, [x0], x1
>> + ld1 {v1.8h}, [x0], x1
>> + ld1 {v2.8h}, [x0], x1
>> + ld1 {v3.8h}, [x0], x1
>> + ld1 {v4.8h}, [x0], x1
>> + ld1 {v5.8h}, [x0], x1
>> + ld1 {v6.8h}, [x0], x1
>> + ld1 {v7.8h}, [x0]
>> + mov w14, #((1 << \bitdepth) - 1)
>
> For loads like these, we can generally save a bit by using two alternating
> registers for loading, with a double stride - see e.g. the vp9 loop filter
> implementations. But that's a micro optimization.
>
> Other than that, this mostly looks reasonable.

Will fix on push if no other comments.
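For reference, the alternating-register load pattern being suggested might
look roughly like this (an untested sketch; the scratch registers chosen here
are arbitrary and not taken from the vp9 code or from this patch):

```asm
        // Hypothetical sketch: load 8 rows through two alternating address
        // registers with a doubled stride, so consecutive ld1s do not all
        // depend on the same post-incremented pointer.
        // x0 = pix, x1 = stride; x9/x10 are assumed to be free scratch regs.
        add             x9,  x0,  x1            // pointer to the second row
        lsl             x10, x1,  #1            // stride * 2
        ld1             {v0.8h}, [x0], x10
        ld1             {v1.8h}, [x9], x10
        ld1             {v2.8h}, [x0], x10
        ld1             {v3.8h}, [x9], x10
        ld1             {v4.8h}, [x0], x10
        ld1             {v5.8h}, [x9], x10
        ld1             {v6.8h}, [x0]
        ld1             {v7.8h}, [x9]
```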
On Wed, 28 Feb 2024, J. Dekker wrote:

> Martin Storsjö <martin@martin.st> writes:
>
> [...]
>
>> I don't quite understand what this sequence does and/or how our data is laid
>> out in our registers - we have d0 on input in v20, where's d3? And doesn't the
>> "and" throw away half of the input elements here?
>
> We have d0, d1, d2, d3 for both 4 line blocks in v20, mask out d1/d2 and
> use pair-wise adds to move our data around and calculate d0+d3
> together. The first addp just moves elements around, the second addp
> adds d0 + 0 + 0 + d3.

Right, I guess this is the bit that was surprising. I would have expected to
have e.g. all the d0 values for e.g. the 8 individual pixels in one SIMD
register, and all the d3 values for all pixels in another SIMD register.

So as we're operating on 8 pixels in parallel, each of those 8 pixels have
their own d0/d3 values, right? Or is this a case where we have just one d0/d3
value for a range of pixels?

// Martin
Martin Storsjö <martin@martin.st> writes:

> On Wed, 28 Feb 2024, J. Dekker wrote:
>
> [...]
>
> Right, I guess this is the bit that was surprising. I would have expected to
> have e.g. all the d0 values for e.g. the 8 individual pixels in one SIMD
> register, and all the d3 values for all pixels in another SIMD register.
>
> So as we're operating on 8 pixels in parallel, each of those 8 pixels have
> their own d0/d3 values, right? Or is this a case where we have just one d0/d3
> value for a range of pixels?

Yes, d0/d1/d2/d3 are per 4 lines of 8 pixels, it's because d0 and d3 are
calculated within their own line, d0 from line 0, d3 from line 3. Maybe
it's more confusing since we are doing both halves of the filter at the
same time? v20 contains d0 d1 d2 d3 d0 d1 d2 d3, where the second d0 is
distinct from the first.

But essentially we're doing the same operation across the entire 8
lines, the filter just makes an overall skip decision for each block of
4 lines based on the sum of the result from line 0 and 3.
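Put differently, the per-block decision being described corresponds roughly to
this scalar logic (a simplified sketch in the spirit of the C reference
filter; the array layout is invented for the example):

```c
#include <stdlib.h>

/* Simplified sketch of the per-block skip decision: p[line][i]/q[line][i]
 * hold pixels p_i/q_i of one of the 4 lines in a block.  Only lines 0 and 3
 * contribute to the decision, which then applies to the whole block. */
static int block_is_filtered(const int p[4][3], const int q[4][3], int beta)
{
    int dp0 = abs(p[0][2] - 2 * p[0][1] + p[0][0]);
    int dq0 = abs(q[0][2] - 2 * q[0][1] + q[0][0]);
    int dp3 = abs(p[3][2] - 2 * p[3][1] + p[3][0]);
    int dq3 = abs(q[3][2] - 2 * q[3][1] + q[3][0]);

    int d0 = dp0 + dq0;     /* from line 0 */
    int d3 = dp3 + dq3;     /* from line 3 */

    return d0 + d3 < beta;  /* false: the whole 4-line block is left as-is */
}
```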
On Wed, 28 Feb 2024, J. Dekker wrote:

> Martin Storsjö <martin@martin.st> writes:
>
> [...]
>
> Yes, d0/d1/d2/d3 are per 4 lines of 8 pixels, it's because d0 and d3 are
> calculated within their own line, d0 from line 0, d3 from line 3. Maybe
> it's more confusing since we are doing both halves of the filter at the
> same time? v20 contains d0 d1 d2 d3 d0 d1 d2 d3, where the second d0 is
> distinct from the first.
>
> But essentially we're doing the same operation across the entire 8
> lines, the filter just makes an overall skip decision for each block of
> 4 lines based on the sum of the result from line 0 and 3.

Ah, right, I see. I guess this makes sense then. Thanks!

Thus, no further objections to it; the optimizing of loading/storing can be
done separately.

// Martin
Martin Storsjö <martin@martin.st> writes:

> On Wed, 28 Feb 2024, J. Dekker wrote:
>
> [...]
>
> Ah, right, I see. I guess this makes sense then. Thanks!
>
> Thus, no further objections to it; the optimizing of loading/storing can be
> done separately.

Thanks, pushed. Will post load/store optimization.
diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S index 8227f65649..581056a91e 100644 --- a/libavcodec/aarch64/hevcdsp_deblock_neon.S +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S @@ -181,3 +181,420 @@ hevc_h_loop_filter_chroma 12 hevc_v_loop_filter_chroma 8 hevc_v_loop_filter_chroma 10 hevc_v_loop_filter_chroma 12 + +.macro hevc_loop_filter_luma_body bitdepth +function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0 +.if \bitdepth > 8 + lsl w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8 +.else + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + uxtl v4.8h, v4.8b + uxtl v5.8h, v5.8b + uxtl v6.8h, v6.8b + uxtl v7.8h, v7.8b +.endif + ldr w7, [x3] // tc[0] + ldr w8, [x3, #4] // tc[1] + dup v18.4h, w7 + dup v19.4h, w8 + trn1 v18.2d, v18.2d, v19.2d +.if \bitdepth > 8 + shl v18.8h, v18.8h, #(\bitdepth - 8) +.endif + dup v27.8h, w2 // beta + // tc25 + shl v19.8h, v18.8h, #2 // * 4 + add v19.8h, v19.8h, v18.8h // (tc * 5) + srshr v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1 + sshr v17.8h, v27.8h, #2 // beta2 + + ////// beta_2 check + // dp0 = abs(P2 - 2 * P1 + P0) + add v22.8h, v3.8h, v1.8h + shl v23.8h, v2.8h, #1 + sabd v30.8h, v22.8h, v23.8h + // dq0 = abs(Q2 - 2 * Q1 + Q0) + add v21.8h, v6.8h, v4.8h + shl v26.8h, v5.8h, #1 + sabd v31.8h, v21.8h, v26.8h + // d0 = dp0 + dq0 + add v20.8h, v30.8h, v31.8h + shl v25.8h, v20.8h, #1 + // (d0 << 1) < beta_2 + cmgt v23.8h, v17.8h, v25.8h + + ////// beta check + // d0 + d3 < beta + mov x9, #0xFFFF00000000FFFF + dup v24.2d, x9 + and v25.16b, v24.16b, v20.16b + addp v25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1 + addp v25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1 + cmgt v25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1] + mov w9, v25.s[0] + cmp w9, #0 + sxtl v26.4s, v25.4h + sxtl v16.2d, v26.2s // full skip mask + b.eq 3f // skip both blocks + + // TODO: we can check the full skip mask with the weak/strong mask to + // potentially skip weak or strong calculation entirely if we only have one + + ////// beta_3 check + // abs(P3 - P0) + abs(Q3 - Q0) < beta_3 + sshr v17.8h, v17.8h, #1 // beta_3 + sabd v20.8h, v0.8h, v3.8h + saba v20.8h, v7.8h, v4.8h + cmgt v21.8h, v17.8h, v20.8h + + and v23.16b, v23.16b, v21.16b + + ////// tc25 check + // abs(P0 - Q0) < tc25 + sabd v20.8h, v3.8h, v4.8h + cmgt v21.8h, v19.8h, v20.8h + + and v23.16b, v23.16b, v21.16b + + ////// Generate low/high line max from lines 0/3/4/7 + // mask out lines 2/3/5/6 + not v20.16b, v24.16b // 0x0000FFFFFFFF0000 + orr v23.16b, v23.16b, v20.16b + + // generate weak/strong mask + uminp v23.8h, v23.8h, v23.8h // extend to singles + sxtl v23.4s, v23.4h + uminp v26.4s, v23.4s, v23.4s // check lines + // extract to gpr + ext v25.16b, v26.16b, v26.16b, #2 + zip1 v17.4s, v26.4s, v26.4s + mov w12, v25.s[0] + mov w11, #0x0000FFFF + mov w13, #0xFFFF0000 + // FFFF FFFF -> strong strong + // FFFF 0000 -> strong weak + // 0000 FFFF -> weak strong + // 0000 0000 -> weak weak + cmp w12, w13 + b.hi 0f // only strong/strong, skip weak nd_p/nd_q calc + + ////// weak nd_p/nd_q + // d0+d3 + and v30.16b, v30.16b, v24.16b // d0 __ __ d3 d4 __ __ d7 + and v31.16b, v31.16b, v24.16b + addp v30.8h, v30.8h, v30.8h // [d0+__ __+d3 d4+__ __+d7] [ ... 
] + addp v31.8h, v31.8h, v31.8h // [d0+d3 d4+d7] + addp v30.4h, v30.4h, v30.4h + addp v31.4h, v31.4h, v31.4h + + // ((beta + (beta >> 1)) >> 3) + sshr v21.8h, v27.8h, #1 + add v21.8h, v21.8h, v27.8h + sshr v21.8h, v21.8h, #3 + + // nd_p = dp0 + dp3 < ((beta + (beta >> 1)) >> 3) + cmgt v30.8h, v21.8h, v30.8h + // nd_q = dq0 + dq3 < ((beta + (beta >> 1)) >> 3) + cmgt v31.8h, v21.8h, v31.8h + + sxtl v30.4s, v30.4h + sxtl v31.4s, v31.4h + sxtl v28.2d, v30.2s + sxtl v29.2d, v31.2s + + cmp w12, w11 + b.lo 1f // can only be weak weak, skip strong + +0: // STRONG FILTER + + // P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc3, tc3); + add v21.8h, v2.8h, v3.8h // (p1 + p0 + add v21.8h, v4.8h, v21.8h // + q0) + shl v21.8h, v21.8h, #1 // * 2 + add v22.8h, v1.8h, v5.8h // (p2 + q1) + add v21.8h, v22.8h, v21.8h // + + srshr v21.8h, v21.8h, #3 // >> 3 + sub v21.8h, v21.8h, v3.8h // - p0 + + // P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); + + add v22.8h, v1.8h, v2.8h + add v23.8h, v3.8h, v4.8h + add v22.8h, v22.8h, v23.8h + srshr v22.8h, v22.8h, #2 + sub v22.8h, v22.8h, v2.8h + + // P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc, tc); + + add v23.8h, v0.8h, v1.8h // p3 + p2 + add v24.8h, v3.8h, v4.8h // p0 + q0 + shl v23.8h, v23.8h, #1 // * 2 + add v23.8h, v23.8h, v24.8h + add v24.8h, v1.8h, v2.8h // p2 + p1 + add v23.8h, v23.8h, v24.8h + srshr v23.8h, v23.8h, #3 + sub v23.8h, v23.8h, v1.8h + + // Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc3, tc3); + add v24.8h, v3.8h, v4.8h // (p0 + q0 + add v24.8h, v5.8h, v24.8h // + q1) + shl v24.8h, v24.8h, #1 // * 2 + add v25.8h, v2.8h, v6.8h // (p1 + q2) + add v24.8h, v25.8h, v24.8h // + + srshr v24.8h, v24.8h, #3 // >> 3 + sub v24.8h, v24.8h, v4.8h // - q0 + + // Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); + + add v25.8h, v6.8h, v5.8h + add v26.8h, v3.8h, v4.8h + add v25.8h, v25.8h, v26.8h + srshr v25.8h, v25.8h, #2 + sub v25.8h, v25.8h, v5.8h + + // Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc, tc); + + add v26.8h, v7.8h, v6.8h + add v27.8h, v6.8h, v5.8h + shl v26.8h, v26.8h, #1 + add v26.8h, v26.8h, v27.8h + add v27.8h, v3.8h, v4.8h + add v26.8h, v26.8h, v27.8h + srshr v26.8h, v26.8h, #3 + sub v26.8h, v26.8h, v6.8h + + // this clip should work properly + shl v30.8h, v18.8h, #1 // tc2 + neg v31.8h, v30.8h // -tc2 + clip v31.8h, v30.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h + + and v21.16b, v21.16b, v16.16b + and v22.16b, v22.16b, v16.16b + and v23.16b, v23.16b, v16.16b + and v24.16b, v24.16b, v16.16b + and v25.16b, v25.16b, v16.16b + and v26.16b, v26.16b, v16.16b + + add v23.8h, v23.8h, v1.8h // careful + add v22.8h, v22.8h, v2.8h + add v21.8h, v21.8h, v3.8h + add v24.8h, v24.8h, v4.8h + add v25.8h, v25.8h, v5.8h + add v26.8h, v26.8h, v6.8h + + cmp w12, w13 + b.hi 2f // only strong/strong, skip weak + +1: // WEAK FILTER + + // delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4 +.if \bitdepth < 12 + sub v27.8h, v4.8h, v3.8h // q0 - p0 + shl v30.8h, v27.8h, #3 // * 8 + add v27.8h, v27.8h, v30.8h // 9 * (q0 - p0) + + sub v30.8h, v5.8h, v2.8h // q1 - p1 + shl v31.8h, v30.8h, #1 // * 2 + + sub v27.8h, v27.8h, v31.8h + sub v27.8h, v27.8h, v30.8h // - 3 * (q1 - p1) + srshr v27.8h, v27.8h, #4 +.else + sub v19.8h, v4.8h, v3.8h // q0 - p0 + sub v20.8h, v5.8h, v2.8h // q1 - p1 + + sshll v30.4s, v19.4h, #3 // * 8 + sshll2 v31.4s, v19.8h, #3 + + shl v27.8h, v20.8h, #1 + + saddw v30.4s, v30.4s, v19.4h // 9 * (q0 - 
p0) + saddw2 v31.4s, v31.4s, v19.8h + + saddl v19.4s, v27.4h, v20.4h // 3 * (q1 - p1) + saddl2 v20.4s, v27.8h, v20.8h + + sub v19.4s, v30.4s, v19.4s + sub v20.4s, v31.4s, v20.4s + + sqrshrn v27.4h, v19.4s, #4 + sqrshrn2 v27.8h, v20.4s, #4 +.endif + + // delta0 10tc check mask + shl v30.8h, v18.8h, #1 // * 2 + shl v31.8h, v18.8h, #3 // * 8 + add v30.8h, v30.8h, v31.8h // 10 * tc + abs v31.8h, v27.8h + cmgt v20.8h, v30.8h, v31.8h // abs(delta0) < 10 * tc + + and v20.16b, v20.16b, v16.16b // combine with full mask + + neg v31.8h, v18.8h // -tc + clip v31.8h, v18.8h, v27.8h // delta0 = av_clip(delta0, -tc, tc) + + // deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2) + add v30.8h, v1.8h, v3.8h + srshr v30.8h, v30.8h, #1 + sub v30.8h, v30.8h, v2.8h + add v30.8h, v30.8h, v27.8h + sshr v30.8h, v30.8h, #1 + + // p3 p2 p1 p0 q0 q1 q2 q3 + // v0 v1 v2 v3 v4 v5 v6 v7 + + // deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); + add v31.8h, v6.8h, v4.8h + srshr v31.8h, v31.8h, #1 + sub v31.8h, v31.8h, v5.8h + sub v31.8h, v31.8h, v27.8h + sshr v31.8h, v31.8h, #1 + + // apply nd_p nd_q mask to deltap1/deltaq1 + and v30.16b, v30.16b, v28.16b + and v31.16b, v31.16b, v29.16b + + // apply full skip mask to deltap1/deltaq1/delta0 + and v30.16b, v30.16b, v20.16b + and v27.16b, v27.16b, v20.16b + and v31.16b, v31.16b, v20.16b + + // clip P1/Q1 to -tc_2, tc_2 + sshr v18.8h, v18.8h, #1 // tc2 + neg v28.8h, v18.8h + clip v28.8h, v18.8h, v30.8h, v31.8h + + // P0 = av_clip_pixel(p0 + delta0) + // Q0 = av_clip_pixel(q0 - delta0) + add v29.8h, v3.8h, v27.8h // P0 + sub v27.8h, v4.8h, v27.8h // Q0 + + // P1 = av_clip_pixel(p1 + deltap1) + // Q1 = av_clip_pixel(q1 + deltaq1) + add v30.8h, v2.8h, v30.8h // P1 + add v31.8h, v5.8h, v31.8h // Q1 + +2: // MIX WEAK/STRONG + + mov v19.16b, v1.16b + mov v20.16b, v6.16b + // copy selection mask + mov v1.16b, v17.16b + mov v2.16b, v17.16b + mov v3.16b, v17.16b + mov v4.16b, v17.16b + mov v5.16b, v17.16b + mov v6.16b, v17.16b + // select + bsl v1.16b, v23.16b, v19.16b // P2 strong/orig + bsl v2.16b, v22.16b, v30.16b // P1 strong/weak + bsl v3.16b, v21.16b, v29.16b // P0 strong/weak + bsl v4.16b, v24.16b, v27.16b // Q0 strong/weak + bsl v5.16b, v25.16b, v31.16b // Q1 strong/weak + bsl v6.16b, v26.16b, v20.16b // Q2 strong/orig + // NOTE: Q3/P3 are unchanged + +.if \bitdepth > 8 + movi v19.8h, #0 + dup v20.8h, w14 + clip v19.8h, v20.8h, v1.8h, v2.8h, v3.8h, v4.8h, v5.8h, v6.8h +.else + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + sqxtun v4.8b, v4.8h + sqxtun v5.8b, v5.8h + sqxtun v6.8b, v6.8h + sqxtun v7.8b, v7.8h +.endif + ret +3: ret x6 +endfunc +.endm + +hevc_loop_filter_luma_body 8 +hevc_loop_filter_luma_body 10 +hevc_loop_filter_luma_body 12 + +// hevc_v_loop_filter_luma(uint8_t *pix, ptrdiff_t stride, int beta, const int32_t *tc, const uint8_t *no_p, const uint8_t *no_q) + +.macro hevc_loop_filter_luma dir, bitdepth +function ff_hevc_\dir\()_loop_filter_luma_\bitdepth\()_neon, export=1 + mov x6, x30 +.ifc \dir, v +.if \bitdepth > 8 + sub x0, x0, #8 +.else + sub x0, x0, #4 +.endif +.else + sub x0, x0, x1, lsl #2 // -4 * xstride +.endif + mov x10, x0 +.if \bitdepth > 8 + ld1 {v0.8h}, [x0], x1 + ld1 {v1.8h}, [x0], x1 + ld1 {v2.8h}, [x0], x1 + ld1 {v3.8h}, [x0], x1 + ld1 {v4.8h}, [x0], x1 + ld1 {v5.8h}, [x0], x1 + ld1 {v6.8h}, [x0], x1 + ld1 {v7.8h}, [x0] + mov w14, #((1 << \bitdepth) - 1) +.ifc \dir, v + transpose_8x8H v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 +.endif +.else + ld1 {v0.8b}, 
[x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x0], x1 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x0], x1 + ld1 {v6.8b}, [x0], x1 + ld1 {v7.8b}, [x0] +.ifc \dir, v + transpose_8x8B v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 +.endif +.endif + bl hevc_loop_filter_luma_body_\bitdepth\()_neon +.if \bitdepth > 8 +.ifc \dir, v + transpose_8x8H v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 +.endif + st1 {v0.8h}, [x10], x1 + st1 {v1.8h}, [x10], x1 + st1 {v2.8h}, [x10], x1 + st1 {v3.8h}, [x10], x1 + st1 {v4.8h}, [x10], x1 + st1 {v5.8h}, [x10], x1 + st1 {v6.8h}, [x10], x1 + st1 {v7.8h}, [x10] +.else +.ifc \dir, v + transpose_8x8B v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 +.endif + st1 {v0.8b}, [x10], x1 + st1 {v1.8b}, [x10], x1 + st1 {v2.8b}, [x10], x1 + st1 {v3.8b}, [x10], x1 + st1 {v4.8b}, [x10], x1 + st1 {v5.8b}, [x10], x1 + st1 {v6.8b}, [x10], x1 + st1 {v7.8b}, [x10] +.endif + ret x6 +endfunc +.endm + +hevc_loop_filter_luma h, 8 +hevc_loop_filter_luma h, 10 +hevc_loop_filter_luma h, 12 + +hevc_loop_filter_luma v, 8 +hevc_loop_filter_luma v, 10 +hevc_loop_filter_luma v, 12 diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 687b6cc5c3..04692aa98e 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -38,6 +38,18 @@ void ff_hevc_h_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride, const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); void ff_hevc_h_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride, const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_v_loop_filter_luma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_v_loop_filter_luma_10_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_v_loop_filter_luma_12_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_h_loop_filter_luma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_h_loop_filter_luma_10_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_h_loop_filter_luma_12_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs, @@ -291,6 +303,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) if (!have_neon(cpu_flags)) return; if (bit_depth == 8) { + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_neon; + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_neon; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_neon; c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_neon; c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon; @@ -379,6 +393,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) } if (bit_depth == 10) { + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_neon; + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_neon; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_neon; c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_neon; 
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon; @@ -395,6 +411,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon; } if (bit_depth == 12) { + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_neon; + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_neon; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_neon; c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_neon; c->add_residual[0] = ff_hevc_add_residual_4x4_12_neon;
Benched using single-threaded full decode on an Ampere Altra.

 Bpp  Before  After   Speedup
 8    73,3s   65,2s   1.124x
 10   114,2s  104,0s  1.098x
 12   125,8s  115,7s  1.087x

Signed-off-by: J. Dekker <jdek@itanimul.li>
---

Slightly improved 12bit version.

 libavcodec/aarch64/hevcdsp_deblock_neon.S | 417 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
 2 files changed, 435 insertions(+)