Message ID | 20220713204854.3114817-5-martin@martin.st |
---|---|
State | Accepted |
Commit | 4136405c86162063e45d40d55c9985f348d4ea0a |
Headers | show |
Series | [FFmpeg-devel,1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
If the max height is just 16, then this should be fine. I assumed that h could have a much higher value (>1024), but if that is not the case, then this is a useful optimization. Thanks! -- Jonathan Swinney On 7/13/22, 3:49 PM, "Martin Storsjö" <martin@martin.st> wrote: CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe. The max height is 16; the max difference per pixel is 255, and a .8h element can easily contain 16*255, thus keep accumulating in two .8h vectors, and just do the final accumulation at the end. This requires a minor register renumbering in ff_pix_abs16_xy2_neon. Before: Cortex A53 A72 A73 Graviton 3 pix_abs_0_0_neon: 97.7 47.0 37.5 22.7 pix_abs_0_1_neon: 154.0 59.0 52.0 25.0 pix_abs_0_3_neon: 179.7 96.7 87.5 41.2 After: pix_abs_0_0_neon: 96.0 39.2 31.2 22.0 pix_abs_0_1_neon: 150.7 59.7 46.2 23.7 pix_abs_0_3_neon: 175.7 83.7 81.7 38.2 --- I remember suggesting this before, and having it dismissed for some reason I don't remember - maybe that the element size wasn't big enough to hold the intermediate results? At least as far as I can see, it can hold the results just fine, and this passes all tests. --- libavcodec/aarch64/me_cmp_neon.S | 102 +++++++++++++++---------------- 1 file changed, 49 insertions(+), 53 deletions(-) diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index 89546869fb..cda7ce0408 100644 --- a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -27,15 +27,16 @@ function ff_pix_abs16_neon, export=1 // x3 ptrdiff_t stride // w4 int h cmp w4, #4 // if h < 4, jump to completion section - movi v18.4S, #0 // clear result accumulator + movi v16.8h, #0 // clear result accumulator + movi v17.8h, #0 // clear result accumulator b.lt 2f 1: ld1 {v0.16b}, [x1], x3 // load pix1 ld1 {v4.16b}, [x2], x3 // load pix2 ld1 {v1.16b}, [x1], x3 // load pix1 ld1 {v5.16b}, [x2], x3 // load pix2 - uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate - uabdl2 v17.8h, v0.16b, v4.16b + uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate + uabal2 v17.8h, v0.16b, v4.16b ld1 {v2.16b}, [x1], x3 // load pix1 ld1 {v6.16b}, [x2], x3 // load pix2 uabal v16.8h, v1.8b, v5.8b // absolute difference accumulate @@ -48,27 +49,26 @@ function ff_pix_abs16_neon, export=1 uabal v16.8h, v3.8b, v7.8b uabal2 v17.8h, v3.16b, v7.16b cmp w4, #4 // if h >= 4, loop - add v16.8h, v16.8h, v17.8h - uaddlv s16, v16.8h // add up everything in v16 accumulator - add d18, d16, d18 // add to the end result register b.ge 1b cbnz w4, 2f // if iterations remain, jump to completion section - fmov w0, s18 // copy result to general purpose register + add v16.8h, v16.8h, v17.8h + uaddlv s16, v16.8h // add up everything in v16 accumulator + fmov w0, s16 // copy result to general purpose register ret 2: ld1 {v0.16b}, [x1], x3 // load pix1 ld1 {v4.16b}, [x2], x3 // load pix2 - uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate - uabal2 v16.8h, v0.16b, v4.16b subs w4, w4, #1 // h -= 1 - addv h16, v16.8h // add up v16 - add d18, d16, d18 // add to result + uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate + uabal2 v17.8h, v0.16b, v4.16b b.ne 2b - fmov w0, s18 // copy result to general purpose register + add v16.8h, v16.8h, v17.8h + uaddlv s16, v16.8h // add up everything in v16 accumulator + fmov w0, s16 // copy result to general purpose register ret endfunc @@ -80,7 +80,8 @@ function ff_pix_abs16_xy2_neon, export=1 // w4 int h add x5, x2, x3 // use x5 to hold uint8_t *pix3 - movi v0.2d, #0 // initialize the result register + movi v21.8h, #0 // initialize the result register + movi v22.8h, #0 // initialize the result register // Load initial pix2 values for either the unrolled version or completion version. ldur q4, [x2, #1] // load pix2+1 @@ -119,15 +120,15 @@ function ff_pix_abs16_xy2_neon, export=1 uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7 uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15 - ldur q22, [x5, #1] // load pix3+1 + ldur q7, [x5, #1] // load pix3+1 add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above - uabdl v24.8h, v1.8b, v23.8b // absolute difference 0..7, i=0 - uabdl2 v23.8h, v1.16b, v23.16b // absolute difference 8..15, i=0 + uabal v21.8h, v1.8b, v23.8b // absolute difference 0..7, i=0 + uabal2 v22.8h, v1.16b, v23.16b // absolute difference 8..15, i=0 - ld1 {v21.16b}, [x5], x3 // load pix3 + ld1 {v6.16b}, [x5], x3 // load pix3 ld1 {v20.16b}, [x1], x3 // load pix1 rshrn v26.8b, v26.8h, #2 // shift right 2 0..7 (rounding shift right) @@ -140,33 +141,33 @@ function ff_pix_abs16_xy2_neon, export=1 rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right) rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15 - uabal v24.8h, v16.8b, v26.8b // absolute difference 0..7, i=1 - uabal2 v23.8h, v16.16b, v26.16b // absolute difference 8..15, i=1 + uabal v21.8h, v16.8b, v26.8b // absolute difference 0..7, i=1 + uabal2 v22.8h, v16.16b, v26.16b // absolute difference 8..15, i=1 - uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7 - uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15 + uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7 + uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15 add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above add v31.8h, v5.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right) rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15 - uabal v24.8h, v17.8b, v28.8b // absolute difference 0..7, i=2 - uabal2 v23.8h, v17.16b, v28.16b // absolute difference 8..15, i=2 - sub w4, w4, #4 // h -= 4 - uabal v24.8h, v20.8b, v30.8b // absolute difference 0..7, i=3 - uabal2 v23.8h, v20.16b, v30.16b // absolute difference 8..15, i=3 + uabal v21.8h, v17.8b, v28.8b // absolute difference 0..7, i=2 + uabal2 v22.8h, v17.16b, v28.16b // absolute difference 8..15, i=2 cmp w4, #4 // loop if h >= 4 - add v4.8h, v23.8h, v24.8h - uaddlv s4, v4.8h // finish adding up accumulated values - add d0, d0, d4 // add the value to the top level accumulator + + uabal v21.8h, v20.8b, v30.8b // absolute difference 0..7, i=3 + uabal2 v22.8h, v20.16b, v30.16b // absolute difference 8..15, i=3 b.ge 1b cbnz w4, 2f // if iterations remain jump to completion section + add v4.8h, v21.8h, v22.8h + uaddlv s0, v4.8h // finish adding up accumulated values + fmov w0, s0 // copy result to general purpose register ret 2: @@ -182,20 +183,18 @@ function ff_pix_abs16_xy2_neon, export=1 add v16.8h, v2.8h, v18.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration add v17.8h, v3.8h, v19.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration // divide by 4 to compute the average of values summed above - urshr v16.8h, v16.8h, #2 // shift right by 2 0..7 (rounding shift right) - urshr v17.8h, v17.8h, #2 // shift right by 2 8..15 - - uxtl2 v7.8h, v1.16b // 8->16 bits pix1 8..15 - uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7 + rshrn v16.8b, v16.8h, #2 // shift right by 2 0..7 (rounding shift right) + rshrn2 v16.16b, v17.8h, #2 // shift right by 2 8..15 - uabd v6.8h, v1.8h, v16.8h // absolute difference 0..7 - uaba v6.8h, v7.8h, v17.8h // absolute difference accumulate 8..15 + uabal v21.8h, v1.8b, v16.8b // absolute difference 0..7 + uabal2 v22.8h, v1.16b, v16.16b // absolute difference accumulate 8..15 mov v2.16b, v18.16b // pix3 -> pix2 mov v3.16b, v19.16b // pix3+1 -> pix2+1 - uaddlv s6, v6.8h // add up accumulator in v6 - add d0, d0, d6 // add to the final result b.ne 2b // loop if h > 0 + + add v4.8h, v21.8h, v22.8h + uaddlv s0, v4.8h // finish adding up accumulated values fmov w0, s0 // copy result to general purpose register ret endfunc @@ -209,7 +208,8 @@ function ff_pix_abs16_x2_neon, export=1 cmp w4, #4 // initialize buffers - movi d20, #0 + movi v16.8h, #0 + movi v17.8h, #0 add x5, x2, #1 // pix2 + 1 b.lt 2f @@ -224,9 +224,9 @@ function ff_pix_abs16_x2_neon, export=1 ld1 {v2.16b}, [x5], x3 urhadd v30.16b, v1.16b, v2.16b ld1 {v0.16b}, [x1], x3 - uabdl v16.8h, v0.8b, v30.8b + uabal v16.8h, v0.8b, v30.8b ld1 {v4.16b}, [x2], x3 - uabdl2 v17.8h, v0.16b, v30.16b + uabal2 v17.8h, v0.16b, v30.16b ld1 {v5.16b}, [x5], x3 urhadd v29.16b, v4.16b, v5.16b ld1 {v3.16b}, [x1], x3 @@ -238,20 +238,15 @@ function ff_pix_abs16_x2_neon, export=1 ld1 {v6.16b}, [x1], x3 uabal v16.8h, v6.8b, v28.8b ld1 {v24.16b}, [x2], x3 + sub w4, w4, #4 uabal2 v17.8h, v6.16b, v28.16b ld1 {v25.16b}, [x5], x3 urhadd v27.16b, v24.16b, v25.16b ld1 {v23.16b}, [x1], x3 + cmp w4, #4 uabal v16.8h, v23.8b, v27.8b uabal2 v17.8h, v23.16b, v27.16b - sub w4, w4, #4 - - add v16.8h, v16.8h, v17.8h - uaddlv s16, v16.8h - cmp w4, #4 - add d20, d20, d16 - b.ge 1b cbz w4, 3f @@ -259,18 +254,19 @@ function ff_pix_abs16_x2_neon, export=1 2: ld1 {v1.16b}, [x2], x3 ld1 {v2.16b}, [x5], x3 + subs w4, w4, #1 urhadd v29.16b, v1.16b, v2.16b ld1 {v0.16b}, [x1], x3 - uabd v28.16b, v0.16b, v29.16b + uabal v16.8h, v0.8b, v29.8b + uabal2 v17.8h, v0.16b, v29.16b - uaddlv h28, v28.16b - subs w4, w4, #1 - add d20, d20, d28 b.ne 2b 3: - fmov w0, s20 + add v16.8h, v16.8h, v17.8h + uaddlv s16, v16.8h + fmov w0, s16 ret endfunc -- 2.25.1
On Fri, 15 Jul 2022, Swinney, Jonathan wrote: > If the max height is just 16, then this should be fine. I assumed that h > could have a much higher value (>1024), but if that is not the case, > then this is a useful optimization. At least according to the me_cmp.h header, which says: /* Motion estimation: * h is limited to { width / 2, width, 2 * width }, * but never larger than 16 and never smaller than 2. * Although currently h < 4 is not used as functions with * width < 8 are neither used nor implemented. */ So with that in mind, I think this should be safe to do. // Martin
On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote: > On Fri, 15 Jul 2022, Swinney, Jonathan wrote: > > > If the max height is just 16, then this should be fine. I assumed that h > > could have a much higher value (>1024), but if that is not the case, > > then this is a useful optimization. > > At least according to the me_cmp.h header, which says: > > /* Motion estimation: > * h is limited to { width / 2, width, 2 * width }, > * but never larger than 16 and never smaller than 2. > * Although currently h < 4 is not used as functions with > * width < 8 are neither used nor implemented. */ These rules where written with support for encoding of all standard formats in mind at the time that was written. today it may make sense to extend these rules to cover the things which where created since then thx [...]
On Fri, 15 Jul 2022, Michael Niedermayer wrote: > On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote: >> On Fri, 15 Jul 2022, Swinney, Jonathan wrote: >> >>> If the max height is just 16, then this should be fine. I assumed that h >>> could have a much higher value (>1024), but if that is not the case, >>> then this is a useful optimization. >> >> At least according to the me_cmp.h header, which says: >> >> /* Motion estimation: >> * h is limited to { width / 2, width, 2 * width }, >> * but never larger than 16 and never smaller than 2. >> * Although currently h < 4 is not used as functions with >> * width < 8 are neither used nor implemented. */ > > These rules where written with support for encoding of all > standard formats in mind at the time that was written. > today it may make sense to extend these rules to cover the > things which where created since then Right, but if that suddenly changes, such a change also must expect that it might need updates to all assembly implementations that implement that interface currently. Right now, both the defacto case (any callers in the codebase) and the explicit documentation says that it can't be called with parameters outside of that range. Even if it's raised from the current <= 16, this particular optimization should be fine as long as h <= 256 - which should be fine for at least all current-gen mainstream codecs since, I think? // Martin
On Fri, 15 Jul 2022, Michael Niedermayer wrote: > On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote: >> On Fri, 15 Jul 2022, Swinney, Jonathan wrote: >> >>> If the max height is just 16, then this should be fine. I assumed that h >>> could have a much higher value (>1024), but if that is not the case, >>> then this is a useful optimization. >> >> At least according to the me_cmp.h header, which says: >> >> /* Motion estimation: >> * h is limited to { width / 2, width, 2 * width }, >> * but never larger than 16 and never smaller than 2. >> * Although currently h < 4 is not used as functions with >> * width < 8 are neither used nor implemented. */ > > These rules where written with support for encoding of all > standard formats in mind at the time that was written. > today it may make sense to extend these rules to cover the > things which where created since then Also - if extending this, I would expect that you want other widths too. Right now, most of the functions seem to be arranged such as [0] is w=16 and [0] is w=8. For those, for w=8, it seems to be mostly hardcoded to only assume h=8, while the w=16 functions actually honor the h parameter. If it ever would be relevant with h>256, that wouldn't be for the existing w=8 or w=16 functions, but for newer functions with a larger width too. So I think this patch is safe (which works for h up to 256), and if someone wants to extend the interface later, that can be done. // Martin
On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote: > On Fri, 15 Jul 2022, Michael Niedermayer wrote: > > > On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote: > > > On Fri, 15 Jul 2022, Swinney, Jonathan wrote: > > > > > > > If the max height is just 16, then this should be fine. I assumed that h > > > > could have a much higher value (>1024), but if that is not the case, > > > > then this is a useful optimization. > > > > > > At least according to the me_cmp.h header, which says: > > > > > > /* Motion estimation: > > > * h is limited to { width / 2, width, 2 * width }, > > > * but never larger than 16 and never smaller than 2. > > > * Although currently h < 4 is not used as functions with > > > * width < 8 are neither used nor implemented. */ > > > > These rules where written with support for encoding of all > > standard formats in mind at the time that was written. > > today it may make sense to extend these rules to cover the > > things which where created since then > > Right, but if that suddenly changes, such a change also must expect that it > might need updates to all assembly implementations that implement that > interface currently. Right now, both the defacto case (any callers in the > codebase) and the explicit documentation says that it can't be called with > parameters outside of that range. What i meant was that newly added functions should be more flexible than these old rules. That is 2 sets of rules 1. What a caller ATM can do and expect to work (thats whats written there) 2. What an implementor of new functions should make sure is supported > > Even if it's raised from the current <= 16, this particular optimization > should be fine as long as h <= 256 - which should be fine for at least all > current-gen mainstream codecs since, I think? [...]
On Sat, 16 Jul 2022, Michael Niedermayer wrote: > On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote: >> On Fri, 15 Jul 2022, Michael Niedermayer wrote: >> >>> On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote: >>>> On Fri, 15 Jul 2022, Swinney, Jonathan wrote: >>>> >>>>> If the max height is just 16, then this should be fine. I assumed that h >>>>> could have a much higher value (>1024), but if that is not the case, >>>>> then this is a useful optimization. >>>> >>>> At least according to the me_cmp.h header, which says: >>>> >>>> /* Motion estimation: >>>> * h is limited to { width / 2, width, 2 * width }, >>>> * but never larger than 16 and never smaller than 2. >>>> * Although currently h < 4 is not used as functions with >>>> * width < 8 are neither used nor implemented. */ >>> >>> These rules where written with support for encoding of all >>> standard formats in mind at the time that was written. >>> today it may make sense to extend these rules to cover the >>> things which where created since then >> >> Right, but if that suddenly changes, such a change also must expect that it >> might need updates to all assembly implementations that implement that >> interface currently. Right now, both the defacto case (any callers in the >> codebase) and the explicit documentation says that it can't be called with >> parameters outside of that range. > > What i meant was that newly added functions should be more flexible than > these old rules. That is 2 sets of rules > 1. What a caller ATM can do and expect to work (thats whats written there) > 2. What an implementor of new functions should make sure is supported With 2., do you mean if adding a new function into the same struct, or if implementing the existing pix_abs[0][..] functions? If you mean new implementations of the existing function interface, you say they "should be more flexible". How flexible must they be? Is it ok to assume h<=256 for the w=16 functions? Gradually increasing the requirements for existing function interfaces like you suggest is really problematic. If we want to require more of the functions, we should document it, and extend the checkasm test to test that new requirement - which also extends the requirement to the existing functions. If we don't have a checkasm test for the required behaviour, we can pretty much assume it's broken, even in new implementations. // Martin
Hi, On Sat, Jul 16, 2022 at 7:23 PM Michael Niedermayer <michael@niedermayer.cc> wrote: > What i meant was that newly added functions should be more flexible than > these old rules. That is 2 sets of rules > 1. What a caller ATM can do and expect to work (thats whats written there) > 2. What an implementor of new functions should make sure is supported > What's the use case exactly that you'd like to support that we currently don't? Ronald
On Sat, Jul 16, 2022 at 08:50:24PM +0800, Ronald S. Bultje wrote: > Hi, > > On Sat, Jul 16, 2022 at 7:23 PM Michael Niedermayer <michael@niedermayer.cc> > wrote: > > > What i meant was that newly added functions should be more flexible than > > these old rules. That is 2 sets of rules > > 1. What a caller ATM can do and expect to work (thats whats written there) > > 2. What an implementor of new functions should make sure is supported > > > > What's the use case exactly that you'd like to support that we currently > don't? The idea is to have general purpose functions which can be used if someone wants to write a new encoder. (this is kind of not a different goal, its rather that the original limitations where based on mpeg2/mpeg4/h264 which was the most flexible at the time) thx [...]
On Sat, Jul 16, 2022 at 03:30:23PM +0300, Martin Storsjö wrote: > On Sat, 16 Jul 2022, Michael Niedermayer wrote: > > > On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote: > > > On Fri, 15 Jul 2022, Michael Niedermayer wrote: > > > > > > > On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote: > > > > > On Fri, 15 Jul 2022, Swinney, Jonathan wrote: > > > > > > > > > > > If the max height is just 16, then this should be fine. I assumed that h > > > > > > could have a much higher value (>1024), but if that is not the case, > > > > > > then this is a useful optimization. > > > > > > > > > > At least according to the me_cmp.h header, which says: > > > > > > > > > > /* Motion estimation: > > > > > * h is limited to { width / 2, width, 2 * width }, > > > > > * but never larger than 16 and never smaller than 2. > > > > > * Although currently h < 4 is not used as functions with > > > > > * width < 8 are neither used nor implemented. */ > > > > > > > > These rules where written with support for encoding of all > > > > standard formats in mind at the time that was written. > > > > today it may make sense to extend these rules to cover the > > > > things which where created since then > > > > > > Right, but if that suddenly changes, such a change also must expect that it > > > might need updates to all assembly implementations that implement that > > > interface currently. Right now, both the defacto case (any callers in the > > > codebase) and the explicit documentation says that it can't be called with > > > parameters outside of that range. > > > > What i meant was that newly added functions should be more flexible than > > these old rules. That is 2 sets of rules > > 1. What a caller ATM can do and expect to work (thats whats written there) > > 2. What an implementor of new functions should make sure is supported > > With 2., do you mean if adding a new function into the same struct, or if > implementing the existing pix_abs[0][..] functions? i would say both > > If you mean new implementations of the existing function interface, you say > they "should be more flexible". How flexible must they be? Is it ok to > assume h<=256 for the w=16 functions? i think thats fine > > Gradually increasing the requirements for existing function interfaces like > you suggest is really problematic. why ? iam really just saying "when you add new code, dont base it on old limitations" And i suggest this because its much easier to do when a function is added than when later one wants to use it When i originally wrote this list of restrictions it didnt list what our code actually used but tried to look ahead to what we would need when we write encoders for all standard formats at that time. We never wrote a h264 encoder but this API was designed to support it. > > If we want to require more of the functions, we should document it, and > extend the checkasm test to test that new requirement - which also extends > the requirement to the existing functions. If we don't have a checkasm test > for the required behaviour, we can pretty much assume it's broken, even in > new implementations. yes [...]
On Sat, 16 Jul 2022, Michael Niedermayer wrote: > On Sat, Jul 16, 2022 at 03:30:23PM +0300, Martin Storsjö wrote: >> On Sat, 16 Jul 2022, Michael Niedermayer wrote: >> >>> On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote: >>>> On Fri, 15 Jul 2022, Michael Niedermayer wrote: >>>> >>>>> On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote: >>>>>> On Fri, 15 Jul 2022, Swinney, Jonathan wrote: >>>>>> >>>>>>> If the max height is just 16, then this should be fine. I assumed that h >>>>>>> could have a much higher value (>1024), but if that is not the case, >>>>>>> then this is a useful optimization. >>>>>> >>>>>> At least according to the me_cmp.h header, which says: >>>>>> >>>>>> /* Motion estimation: >>>>>> * h is limited to { width / 2, width, 2 * width }, >>>>>> * but never larger than 16 and never smaller than 2. >>>>>> * Although currently h < 4 is not used as functions with >>>>>> * width < 8 are neither used nor implemented. */ >>>>> >>>>> These rules where written with support for encoding of all >>>>> standard formats in mind at the time that was written. >>>>> today it may make sense to extend these rules to cover the >>>>> things which where created since then >>>> >>>> Right, but if that suddenly changes, such a change also must expect that it >>>> might need updates to all assembly implementations that implement that >>>> interface currently. Right now, both the defacto case (any callers in the >>>> codebase) and the explicit documentation says that it can't be called with >>>> parameters outside of that range. >>> >>> What i meant was that newly added functions should be more flexible than >>> these old rules. That is 2 sets of rules >>> 1. What a caller ATM can do and expect to work (thats whats written there) >>> 2. What an implementor of new functions should make sure is supported >> >> With 2., do you mean if adding a new function into the same struct, or if >> implementing the existing pix_abs[0][..] functions? > > i would say both > > >> >> If you mean new implementations of the existing function interface, you say >> they "should be more flexible". How flexible must they be? Is it ok to >> assume h<=256 for the w=16 functions? > > i think thats fine Ok, I'll go ahead and push this then. >> >> Gradually increasing the requirements for existing function interfaces like >> you suggest is really problematic. > > why ? > iam really just saying > "when you add new code, dont base it on old limitations" For this case, I just quoted what the header said, which seemed authoritative to me - but it's fine for me with a wider spec too, up to h=256. But saying arbitrarily "any height" really inhibits what you can do in asm. Also if I shouldn't reference that limitation in the header, please update/reword it, as it's actively misleading for anyone working on this right now. And in general, we can design for an intended use case and calculate whether it should work or not - but as long as it's not tested, those cases will often have hidden bugs - see e.g. patch 1/5 in this series. // Martin
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index 89546869fb..cda7ce0408 100644 --- a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -27,15 +27,16 @@ function ff_pix_abs16_neon, export=1 // x3 ptrdiff_t stride // w4 int h cmp w4, #4 // if h < 4, jump to completion section - movi v18.4S, #0 // clear result accumulator + movi v16.8h, #0 // clear result accumulator + movi v17.8h, #0 // clear result accumulator b.lt 2f 1: ld1 {v0.16b}, [x1], x3 // load pix1 ld1 {v4.16b}, [x2], x3 // load pix2 ld1 {v1.16b}, [x1], x3 // load pix1 ld1 {v5.16b}, [x2], x3 // load pix2 - uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate - uabdl2 v17.8h, v0.16b, v4.16b + uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate + uabal2 v17.8h, v0.16b, v4.16b ld1 {v2.16b}, [x1], x3 // load pix1 ld1 {v6.16b}, [x2], x3 // load pix2 uabal v16.8h, v1.8b, v5.8b // absolute difference accumulate @@ -48,27 +49,26 @@ function ff_pix_abs16_neon, export=1 uabal v16.8h, v3.8b, v7.8b uabal2 v17.8h, v3.16b, v7.16b cmp w4, #4 // if h >= 4, loop - add v16.8h, v16.8h, v17.8h - uaddlv s16, v16.8h // add up everything in v16 accumulator - add d18, d16, d18 // add to the end result register b.ge 1b cbnz w4, 2f // if iterations remain, jump to completion section - fmov w0, s18 // copy result to general purpose register + add v16.8h, v16.8h, v17.8h + uaddlv s16, v16.8h // add up everything in v16 accumulator + fmov w0, s16 // copy result to general purpose register ret 2: ld1 {v0.16b}, [x1], x3 // load pix1 ld1 {v4.16b}, [x2], x3 // load pix2 - uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate - uabal2 v16.8h, v0.16b, v4.16b subs w4, w4, #1 // h -= 1 - addv h16, v16.8h // add up v16 - add d18, d16, d18 // add to result + uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate + uabal2 v17.8h, v0.16b, v4.16b b.ne 2b - fmov w0, s18 // copy result to general purpose register + add v16.8h, v16.8h, v17.8h + uaddlv s16, v16.8h // add up everything in v16 accumulator + fmov w0, s16 // copy result to general purpose register ret endfunc @@ -80,7 +80,8 @@ function ff_pix_abs16_xy2_neon, export=1 // w4 int h add x5, x2, x3 // use x5 to hold uint8_t *pix3 - movi v0.2d, #0 // initialize the result register + movi v21.8h, #0 // initialize the result register + movi v22.8h, #0 // initialize the result register // Load initial pix2 values for either the unrolled version or completion version. ldur q4, [x2, #1] // load pix2+1 @@ -119,15 +120,15 @@ function ff_pix_abs16_xy2_neon, export=1 uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7 uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15 - ldur q22, [x5, #1] // load pix3+1 + ldur q7, [x5, #1] // load pix3+1 add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above - uabdl v24.8h, v1.8b, v23.8b // absolute difference 0..7, i=0 - uabdl2 v23.8h, v1.16b, v23.16b // absolute difference 8..15, i=0 + uabal v21.8h, v1.8b, v23.8b // absolute difference 0..7, i=0 + uabal2 v22.8h, v1.16b, v23.16b // absolute difference 8..15, i=0 - ld1 {v21.16b}, [x5], x3 // load pix3 + ld1 {v6.16b}, [x5], x3 // load pix3 ld1 {v20.16b}, [x1], x3 // load pix1 rshrn v26.8b, v26.8h, #2 // shift right 2 0..7 (rounding shift right) @@ -140,33 +141,33 @@ function ff_pix_abs16_xy2_neon, export=1 rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right) rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15 - uabal v24.8h, v16.8b, v26.8b // absolute difference 0..7, i=1 - uabal2 v23.8h, v16.16b, v26.16b // absolute difference 8..15, i=1 + uabal v21.8h, v16.8b, v26.8b // absolute difference 0..7, i=1 + uabal2 v22.8h, v16.16b, v26.16b // absolute difference 8..15, i=1 - uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7 - uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15 + uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7 + uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15 add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above add v31.8h, v5.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right) rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15 - uabal v24.8h, v17.8b, v28.8b // absolute difference 0..7, i=2 - uabal2 v23.8h, v17.16b, v28.16b // absolute difference 8..15, i=2 - sub w4, w4, #4 // h -= 4 - uabal v24.8h, v20.8b, v30.8b // absolute difference 0..7, i=3 - uabal2 v23.8h, v20.16b, v30.16b // absolute difference 8..15, i=3 + uabal v21.8h, v17.8b, v28.8b // absolute difference 0..7, i=2 + uabal2 v22.8h, v17.16b, v28.16b // absolute difference 8..15, i=2 cmp w4, #4 // loop if h >= 4 - add v4.8h, v23.8h, v24.8h - uaddlv s4, v4.8h // finish adding up accumulated values - add d0, d0, d4 // add the value to the top level accumulator + + uabal v21.8h, v20.8b, v30.8b // absolute difference 0..7, i=3 + uabal2 v22.8h, v20.16b, v30.16b // absolute difference 8..15, i=3 b.ge 1b cbnz w4, 2f // if iterations remain jump to completion section + add v4.8h, v21.8h, v22.8h + uaddlv s0, v4.8h // finish adding up accumulated values + fmov w0, s0 // copy result to general purpose register ret 2: @@ -182,20 +183,18 @@ function ff_pix_abs16_xy2_neon, export=1 add v16.8h, v2.8h, v18.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration add v17.8h, v3.8h, v19.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration // divide by 4 to compute the average of values summed above - urshr v16.8h, v16.8h, #2 // shift right by 2 0..7 (rounding shift right) - urshr v17.8h, v17.8h, #2 // shift right by 2 8..15 - - uxtl2 v7.8h, v1.16b // 8->16 bits pix1 8..15 - uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7 + rshrn v16.8b, v16.8h, #2 // shift right by 2 0..7 (rounding shift right) + rshrn2 v16.16b, v17.8h, #2 // shift right by 2 8..15 - uabd v6.8h, v1.8h, v16.8h // absolute difference 0..7 - uaba v6.8h, v7.8h, v17.8h // absolute difference accumulate 8..15 + uabal v21.8h, v1.8b, v16.8b // absolute difference 0..7 + uabal2 v22.8h, v1.16b, v16.16b // absolute difference accumulate 8..15 mov v2.16b, v18.16b // pix3 -> pix2 mov v3.16b, v19.16b // pix3+1 -> pix2+1 - uaddlv s6, v6.8h // add up accumulator in v6 - add d0, d0, d6 // add to the final result b.ne 2b // loop if h > 0 + + add v4.8h, v21.8h, v22.8h + uaddlv s0, v4.8h // finish adding up accumulated values fmov w0, s0 // copy result to general purpose register ret endfunc @@ -209,7 +208,8 @@ function ff_pix_abs16_x2_neon, export=1 cmp w4, #4 // initialize buffers - movi d20, #0 + movi v16.8h, #0 + movi v17.8h, #0 add x5, x2, #1 // pix2 + 1 b.lt 2f @@ -224,9 +224,9 @@ function ff_pix_abs16_x2_neon, export=1 ld1 {v2.16b}, [x5], x3 urhadd v30.16b, v1.16b, v2.16b ld1 {v0.16b}, [x1], x3 - uabdl v16.8h, v0.8b, v30.8b + uabal v16.8h, v0.8b, v30.8b ld1 {v4.16b}, [x2], x3 - uabdl2 v17.8h, v0.16b, v30.16b + uabal2 v17.8h, v0.16b, v30.16b ld1 {v5.16b}, [x5], x3 urhadd v29.16b, v4.16b, v5.16b ld1 {v3.16b}, [x1], x3 @@ -238,20 +238,15 @@ function ff_pix_abs16_x2_neon, export=1 ld1 {v6.16b}, [x1], x3 uabal v16.8h, v6.8b, v28.8b ld1 {v24.16b}, [x2], x3 + sub w4, w4, #4 uabal2 v17.8h, v6.16b, v28.16b ld1 {v25.16b}, [x5], x3 urhadd v27.16b, v24.16b, v25.16b ld1 {v23.16b}, [x1], x3 + cmp w4, #4 uabal v16.8h, v23.8b, v27.8b uabal2 v17.8h, v23.16b, v27.16b - sub w4, w4, #4 - - add v16.8h, v16.8h, v17.8h - uaddlv s16, v16.8h - cmp w4, #4 - add d20, d20, d16 - b.ge 1b cbz w4, 3f @@ -259,18 +254,19 @@ function ff_pix_abs16_x2_neon, export=1 2: ld1 {v1.16b}, [x2], x3 ld1 {v2.16b}, [x5], x3 + subs w4, w4, #1 urhadd v29.16b, v1.16b, v2.16b ld1 {v0.16b}, [x1], x3 - uabd v28.16b, v0.16b, v29.16b + uabal v16.8h, v0.8b, v29.8b + uabal2 v17.8h, v0.16b, v29.16b - uaddlv h28, v28.16b - subs w4, w4, #1 - add d20, d20, d28 b.ne 2b 3: - fmov w0, s20 + add v16.8h, v16.8h, v17.8h + uaddlv s16, v16.8h + fmov w0, s16 ret endfunc