diff mbox series

[FFmpeg-devel,5/5] aarch64: me_cmp: Don't do uaddlv once per iteration

Message ID 20220713204854.3114817-5-martin@martin.st
State Accepted
Commit 4136405c86162063e45d40d55c9985f348d4ea0a
Headers show
Series [FFmpeg-devel,1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Martin Storsjö July 13, 2022, 8:48 p.m. UTC
The max height is 16; the max difference per pixel is 255, and
a .8h element can easily contain 16*255, thus keep accumulating
in two .8h vectors, and just do the final accumulation at the
end.

This requires a minor register renumbering in ff_pix_abs16_xy2_neon.

Before:       Cortex A53    A72    A73   Graviton 3
pix_abs_0_0_neon:   97.7   47.0   37.5   22.7
pix_abs_0_1_neon:  154.0   59.0   52.0   25.0
pix_abs_0_3_neon:  179.7   96.7   87.5   41.2
After:
pix_abs_0_0_neon:   96.0   39.2   31.2   22.0
pix_abs_0_1_neon:  150.7   59.7   46.2   23.7
pix_abs_0_3_neon:  175.7   83.7   81.7   38.2

---
I remember suggesting this before, and having it dismissed for
some reason I don't remember - maybe that the element size wasn't
big enough to hold the intermediate results? At least as far as I
can see, it can hold the results just fine, and this passes all
tests.
---
 libavcodec/aarch64/me_cmp_neon.S | 102 +++++++++++++++----------------
 1 file changed, 49 insertions(+), 53 deletions(-)

Comments

Swinney, Jonathan July 15, 2022, 7:32 p.m. UTC | #1
If the max height is just 16, then this should be fine. I assumed that h could have a much higher value (>1024), but if that is not the case, then this is a useful optimization.

Thanks!

-- 

Jonathan Swinney

On 7/13/22, 3:49 PM, "Martin Storsjö" <martin@martin.st> wrote:

    CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.



    The max height is 16; the max difference per pixel is 255, and
    a .8h element can easily contain 16*255, thus keep accumulating
    in two .8h vectors, and just do the final accumulation at the
    end.

    This requires a minor register renumbering in ff_pix_abs16_xy2_neon.

    Before:       Cortex A53    A72    A73   Graviton 3
    pix_abs_0_0_neon:   97.7   47.0   37.5   22.7
    pix_abs_0_1_neon:  154.0   59.0   52.0   25.0
    pix_abs_0_3_neon:  179.7   96.7   87.5   41.2
    After:
    pix_abs_0_0_neon:   96.0   39.2   31.2   22.0
    pix_abs_0_1_neon:  150.7   59.7   46.2   23.7
    pix_abs_0_3_neon:  175.7   83.7   81.7   38.2

    ---
    I remember suggesting this before, and having it dismissed for
    some reason I don't remember - maybe that the element size wasn't
    big enough to hold the intermediate results? At least as far as I
    can see, it can hold the results just fine, and this passes all
    tests.
    ---
     libavcodec/aarch64/me_cmp_neon.S | 102 +++++++++++++++----------------
     1 file changed, 49 insertions(+), 53 deletions(-)

    diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
    index 89546869fb..cda7ce0408 100644
    --- a/libavcodec/aarch64/me_cmp_neon.S
    +++ b/libavcodec/aarch64/me_cmp_neon.S
    @@ -27,15 +27,16 @@ function ff_pix_abs16_neon, export=1
             // x3           ptrdiff_t stride
             // w4           int h
             cmp             w4, #4                      // if h < 4, jump to completion section
    -        movi            v18.4S, #0                  // clear result accumulator
    +        movi            v16.8h, #0                  // clear result accumulator
    +        movi            v17.8h, #0                  // clear result accumulator
             b.lt            2f
     1:
             ld1             {v0.16b}, [x1], x3          // load pix1
             ld1             {v4.16b}, [x2], x3          // load pix2
             ld1             {v1.16b}, [x1], x3          // load pix1
             ld1             {v5.16b}, [x2], x3          // load pix2
    -        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    -        uabdl2          v17.8h, v0.16b, v4.16b
    +        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    +        uabal2          v17.8h, v0.16b, v4.16b
             ld1             {v2.16b}, [x1], x3          // load pix1
             ld1             {v6.16b}, [x2], x3          // load pix2
             uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
    @@ -48,27 +49,26 @@ function ff_pix_abs16_neon, export=1
             uabal           v16.8h, v3.8b, v7.8b
             uabal2          v17.8h, v3.16b, v7.16b
             cmp             w4, #4                      // if h >= 4, loop
    -        add             v16.8h, v16.8h, v17.8h
    -        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
    -        add             d18, d16, d18               // add to the end result register

             b.ge            1b
             cbnz            w4, 2f                      // if iterations remain, jump to completion section

    -        fmov            w0, s18                     // copy result to general purpose register
    +        add             v16.8h, v16.8h, v17.8h
    +        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
    +        fmov            w0, s16                     // copy result to general purpose register
             ret

     2:
             ld1             {v0.16b}, [x1], x3          // load pix1
             ld1             {v4.16b}, [x2], x3          // load pix2
    -        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    -        uabal2          v16.8h, v0.16b, v4.16b
             subs            w4, w4, #1                  // h -= 1
    -        addv            h16, v16.8h                 // add up v16
    -        add             d18, d16, d18               // add to result
    +        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    +        uabal2          v17.8h, v0.16b, v4.16b
             b.ne            2b

    -        fmov            w0, s18                     // copy result to general purpose register
    +        add             v16.8h, v16.8h, v17.8h
    +        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
    +        fmov            w0, s16                     // copy result to general purpose register
             ret
     endfunc

    @@ -80,7 +80,8 @@ function ff_pix_abs16_xy2_neon, export=1
             // w4           int h

             add             x5, x2, x3                  // use x5 to hold uint8_t *pix3
    -        movi            v0.2d, #0                   // initialize the result register
    +        movi            v21.8h, #0                  // initialize the result register
    +        movi            v22.8h, #0                  // initialize the result register

             // Load initial pix2 values for either the unrolled version or completion version.
             ldur            q4, [x2, #1]                // load pix2+1
    @@ -119,15 +120,15 @@ function ff_pix_abs16_xy2_neon, export=1
             uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
             uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15

    -        ldur            q22, [x5, #1]               // load pix3+1
    +        ldur            q7, [x5, #1]                // load pix3+1

             add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
             add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above

    -        uabdl           v24.8h, v1.8b,  v23.8b      // absolute difference 0..7, i=0
    -        uabdl2          v23.8h, v1.16b, v23.16b     // absolute difference 8..15, i=0
    +        uabal           v21.8h, v1.8b,  v23.8b      // absolute difference 0..7, i=0
    +        uabal2          v22.8h, v1.16b, v23.16b     // absolute difference 8..15, i=0

    -        ld1             {v21.16b}, [x5], x3         // load pix3
    +        ld1             {v6.16b}, [x5], x3          // load pix3
             ld1             {v20.16b}, [x1], x3         // load pix1

             rshrn           v26.8b, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
    @@ -140,33 +141,33 @@ function ff_pix_abs16_xy2_neon, export=1
             rshrn           v28.8b, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
             rshrn2          v28.16b, v29.8h, #2         // shift right 2 8..15

    -        uabal           v24.8h, v16.8b,  v26.8b     // absolute difference 0..7, i=1
    -        uabal2          v23.8h, v16.16b, v26.16b    // absolute difference 8..15, i=1
    +        uabal           v21.8h, v16.8b,  v26.8b     // absolute difference 0..7, i=1
    +        uabal2          v22.8h, v16.16b, v26.16b    // absolute difference 8..15, i=1

    -        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
    -        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
    +        uaddl           v2.8h, v6.8b,  v7.8b        // pix3 + pix3+1 0..7
    +        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
             add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
             add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
             rshrn           v30.8b, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
             rshrn2          v30.16b, v31.8h, #2         // shift right 2 8..15

    -        uabal           v24.8h, v17.8b,  v28.8b     // absolute difference 0..7, i=2
    -        uabal2          v23.8h, v17.16b, v28.16b    // absolute difference 8..15, i=2
    -
             sub             w4, w4, #4                  // h -= 4

    -        uabal           v24.8h, v20.8b,  v30.8b     // absolute difference 0..7, i=3
    -        uabal2          v23.8h, v20.16b, v30.16b    // absolute difference 8..15, i=3
    +        uabal           v21.8h, v17.8b,  v28.8b     // absolute difference 0..7, i=2
    +        uabal2          v22.8h, v17.16b, v28.16b    // absolute difference 8..15, i=2

             cmp             w4, #4                      // loop if h >= 4

    -        add             v4.8h, v23.8h, v24.8h
    -        uaddlv          s4, v4.8h                   // finish adding up accumulated values
    -        add             d0, d0, d4                  // add the value to the top level accumulator
    +
    +        uabal           v21.8h, v20.8b,  v30.8b     // absolute difference 0..7, i=3
    +        uabal2          v22.8h, v20.16b, v30.16b    // absolute difference 8..15, i=3

             b.ge            1b
             cbnz            w4, 2f                      // if iterations remain jump to completion section

    +        add             v4.8h, v21.8h, v22.8h
    +        uaddlv          s0, v4.8h                   // finish adding up accumulated values
    +
             fmov            w0, s0                      // copy result to general purpose register
             ret
     2:
    @@ -182,20 +183,18 @@ function ff_pix_abs16_xy2_neon, export=1
             add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
             add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
             // divide by 4 to compute the average of values summed above
    -        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
    -        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15
    -
    -        uxtl2           v7.8h, v1.16b               // 8->16 bits pix1 8..15
    -        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
    +        rshrn           v16.8b,  v16.8h, #2         // shift right by 2 0..7 (rounding shift right)
    +        rshrn2          v16.16b, v17.8h, #2         // shift right by 2 8..15

    -        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
    -        uaba            v6.8h, v7.8h, v17.8h        // absolute difference accumulate 8..15
    +        uabal           v21.8h, v1.8b,  v16.8b      // absolute difference 0..7
    +        uabal2          v22.8h, v1.16b, v16.16b     // absolute difference accumulate 8..15
             mov             v2.16b, v18.16b             // pix3 -> pix2
             mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
    -        uaddlv          s6, v6.8h                   // add up accumulator in v6
    -        add             d0, d0, d6                  // add to the final result

             b.ne            2b                          // loop if h > 0
    +
    +        add             v4.8h, v21.8h, v22.8h
    +        uaddlv          s0, v4.8h                   // finish adding up accumulated values
             fmov            w0, s0                      // copy result to general purpose register
             ret
     endfunc
    @@ -209,7 +208,8 @@ function ff_pix_abs16_x2_neon, export=1

             cmp             w4, #4
             // initialize buffers
    -        movi            d20, #0
    +        movi            v16.8h, #0
    +        movi            v17.8h, #0
             add             x5, x2, #1 // pix2 + 1
             b.lt            2f

    @@ -224,9 +224,9 @@ function ff_pix_abs16_x2_neon, export=1
             ld1             {v2.16b}, [x5], x3
             urhadd          v30.16b, v1.16b, v2.16b
             ld1             {v0.16b}, [x1], x3
    -        uabdl           v16.8h, v0.8b, v30.8b
    +        uabal           v16.8h, v0.8b, v30.8b
             ld1             {v4.16b}, [x2], x3
    -        uabdl2          v17.8h, v0.16b, v30.16b
    +        uabal2          v17.8h, v0.16b, v30.16b
             ld1             {v5.16b}, [x5], x3
             urhadd          v29.16b, v4.16b, v5.16b
             ld1             {v3.16b}, [x1], x3
    @@ -238,20 +238,15 @@ function ff_pix_abs16_x2_neon, export=1
             ld1             {v6.16b}, [x1], x3
             uabal           v16.8h, v6.8b, v28.8b
             ld1             {v24.16b}, [x2], x3
    +        sub             w4, w4, #4
             uabal2          v17.8h, v6.16b, v28.16b
             ld1             {v25.16b}, [x5], x3
             urhadd          v27.16b, v24.16b, v25.16b
             ld1             {v23.16b}, [x1], x3
    +        cmp             w4, #4
             uabal           v16.8h, v23.8b, v27.8b
             uabal2          v17.8h, v23.16b, v27.16b

    -        sub             w4, w4, #4
    -
    -        add             v16.8h, v16.8h, v17.8h
    -        uaddlv          s16, v16.8h
    -        cmp             w4, #4
    -        add             d20, d20, d16
    -
             b.ge            1b
             cbz             w4, 3f

    @@ -259,18 +254,19 @@ function ff_pix_abs16_x2_neon, export=1
     2:
             ld1             {v1.16b}, [x2], x3
             ld1             {v2.16b}, [x5], x3
    +        subs            w4, w4, #1
             urhadd          v29.16b, v1.16b, v2.16b
             ld1             {v0.16b}, [x1], x3
    -        uabd            v28.16b, v0.16b, v29.16b
    +        uabal           v16.8h, v0.8b,  v29.8b
    +        uabal2          v17.8h, v0.16b, v29.16b

    -        uaddlv          h28, v28.16b
    -        subs            w4, w4, #1

    -        add             d20, d20, d28
             b.ne            2b

     3:
    -        fmov            w0, s20
    +        add             v16.8h, v16.8h, v17.8h
    +        uaddlv          s16, v16.8h
    +        fmov            w0, s16

             ret
     endfunc
    --
    2.25.1
Martin Storsjö July 15, 2022, 7:56 p.m. UTC | #2
On Fri, 15 Jul 2022, Swinney, Jonathan wrote:

> If the max height is just 16, then this should be fine. I assumed that h 
> could have a much higher value (>1024), but if that is not the case, 
> then this is a useful optimization.

At least according to the me_cmp.h header, which says:

/* Motion estimation:
  * h is limited to { width / 2, width, 2 * width },
  * but never larger than 16 and never smaller than 2.
  * Although currently h < 4 is not used as functions with
  * width < 8 are neither used nor implemented. */

So with that in mind, I think this should be safe to do.

// Martin
Michael Niedermayer July 15, 2022, 9:19 p.m. UTC | #3
On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
> On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
> 
> > If the max height is just 16, then this should be fine. I assumed that h
> > could have a much higher value (>1024), but if that is not the case,
> > then this is a useful optimization.
> 
> At least according to the me_cmp.h header, which says:
> 
> /* Motion estimation:
>  * h is limited to { width / 2, width, 2 * width },
>  * but never larger than 16 and never smaller than 2.
>  * Although currently h < 4 is not used as functions with
>  * width < 8 are neither used nor implemented. */

These rules where written with support for encoding of all
standard formats in mind at the time that was written.
today it may make sense to extend these rules to cover the
things which where created since then

thx

[...]
Martin Storsjö July 15, 2022, 9:25 p.m. UTC | #4
On Fri, 15 Jul 2022, Michael Niedermayer wrote:

> On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
>> On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
>>
>>> If the max height is just 16, then this should be fine. I assumed that h
>>> could have a much higher value (>1024), but if that is not the case,
>>> then this is a useful optimization.
>>
>> At least according to the me_cmp.h header, which says:
>>
>> /* Motion estimation:
>>  * h is limited to { width / 2, width, 2 * width },
>>  * but never larger than 16 and never smaller than 2.
>>  * Although currently h < 4 is not used as functions with
>>  * width < 8 are neither used nor implemented. */
>
> These rules where written with support for encoding of all
> standard formats in mind at the time that was written.
> today it may make sense to extend these rules to cover the
> things which where created since then

Right, but if that suddenly changes, such a change also must expect that 
it might need updates to all assembly implementations that implement that 
interface currently. Right now, both the defacto case (any callers in the 
codebase) and the explicit documentation says that it can't be called with 
parameters outside of that range.

Even if it's raised from the current <= 16, this particular optimization 
should be fine as long as h <= 256 - which should be fine for at least all 
current-gen mainstream codecs since, I think?

// Martin
Martin Storsjö July 16, 2022, 9:18 a.m. UTC | #5
On Fri, 15 Jul 2022, Michael Niedermayer wrote:

> On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
>> On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
>>
>>> If the max height is just 16, then this should be fine. I assumed that h
>>> could have a much higher value (>1024), but if that is not the case,
>>> then this is a useful optimization.
>>
>> At least according to the me_cmp.h header, which says:
>>
>> /* Motion estimation:
>>  * h is limited to { width / 2, width, 2 * width },
>>  * but never larger than 16 and never smaller than 2.
>>  * Although currently h < 4 is not used as functions with
>>  * width < 8 are neither used nor implemented. */
>
> These rules where written with support for encoding of all
> standard formats in mind at the time that was written.
> today it may make sense to extend these rules to cover the
> things which where created since then

Also - if extending this, I would expect that you want other widths too. 
Right now, most of the functions seem to be arranged such as [0] is w=16 
and [0] is w=8. For those, for w=8, it seems to be mostly hardcoded to 
only assume h=8, while the w=16 functions actually honor the h parameter.

If it ever would be relevant with h>256, that wouldn't be for the existing 
w=8 or w=16 functions, but for newer functions with a larger width too.

So I think this patch is safe (which works for h up to 256), and if 
someone wants to extend the interface later, that can be done.

// Martin
Michael Niedermayer July 16, 2022, 11:23 a.m. UTC | #6
On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote:
> On Fri, 15 Jul 2022, Michael Niedermayer wrote:
> 
> > On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
> > > On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
> > > 
> > > > If the max height is just 16, then this should be fine. I assumed that h
> > > > could have a much higher value (>1024), but if that is not the case,
> > > > then this is a useful optimization.
> > > 
> > > At least according to the me_cmp.h header, which says:
> > > 
> > > /* Motion estimation:
> > >  * h is limited to { width / 2, width, 2 * width },
> > >  * but never larger than 16 and never smaller than 2.
> > >  * Although currently h < 4 is not used as functions with
> > >  * width < 8 are neither used nor implemented. */
> > 
> > These rules where written with support for encoding of all
> > standard formats in mind at the time that was written.
> > today it may make sense to extend these rules to cover the
> > things which where created since then
> 
> Right, but if that suddenly changes, such a change also must expect that it
> might need updates to all assembly implementations that implement that
> interface currently. Right now, both the defacto case (any callers in the
> codebase) and the explicit documentation says that it can't be called with
> parameters outside of that range.

What i meant was that newly added functions should be more flexible than
these old rules. That is 2 sets of rules
1. What a caller ATM can do and expect to work (thats whats written there)
2. What an implementor of new functions should make sure is supported


> 
> Even if it's raised from the current <= 16, this particular optimization
> should be fine as long as h <= 256 - which should be fine for at least all
> current-gen mainstream codecs since, I think?

[...]
Martin Storsjö July 16, 2022, 12:30 p.m. UTC | #7
On Sat, 16 Jul 2022, Michael Niedermayer wrote:

> On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote:
>> On Fri, 15 Jul 2022, Michael Niedermayer wrote:
>>
>>> On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
>>>> On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
>>>>
>>>>> If the max height is just 16, then this should be fine. I assumed that h
>>>>> could have a much higher value (>1024), but if that is not the case,
>>>>> then this is a useful optimization.
>>>>
>>>> At least according to the me_cmp.h header, which says:
>>>>
>>>> /* Motion estimation:
>>>>  * h is limited to { width / 2, width, 2 * width },
>>>>  * but never larger than 16 and never smaller than 2.
>>>>  * Although currently h < 4 is not used as functions with
>>>>  * width < 8 are neither used nor implemented. */
>>>
>>> These rules where written with support for encoding of all
>>> standard formats in mind at the time that was written.
>>> today it may make sense to extend these rules to cover the
>>> things which where created since then
>>
>> Right, but if that suddenly changes, such a change also must expect that it
>> might need updates to all assembly implementations that implement that
>> interface currently. Right now, both the defacto case (any callers in the
>> codebase) and the explicit documentation says that it can't be called with
>> parameters outside of that range.
>
> What i meant was that newly added functions should be more flexible than
> these old rules. That is 2 sets of rules
> 1. What a caller ATM can do and expect to work (thats whats written there)
> 2. What an implementor of new functions should make sure is supported

With 2., do you mean if adding a new function into the same struct, or if 
implementing the existing pix_abs[0][..] functions?

If you mean new implementations of the existing function interface, you 
say they "should be more flexible". How flexible must they be? Is it ok to 
assume h<=256 for the w=16 functions?

Gradually increasing the requirements for existing function interfaces 
like you suggest is really problematic.

If we want to require more of the functions, we should document it, and 
extend the checkasm test to test that new requirement - which also extends 
the requirement to the existing functions. If we don't have a checkasm 
test for the required behaviour, we can pretty much assume it's broken, 
even in new implementations.

// Martin
Ronald S. Bultje July 16, 2022, 12:50 p.m. UTC | #8
Hi,

On Sat, Jul 16, 2022 at 7:23 PM Michael Niedermayer <michael@niedermayer.cc>
wrote:

> What i meant was that newly added functions should be more flexible than
> these old rules. That is 2 sets of rules
> 1. What a caller ATM can do and expect to work (thats whats written there)
> 2. What an implementor of new functions should make sure is supported
>

What's the use case exactly that you'd like to support that we currently
don't?

Ronald
Michael Niedermayer July 16, 2022, 1:06 p.m. UTC | #9
On Sat, Jul 16, 2022 at 08:50:24PM +0800, Ronald S. Bultje wrote:
> Hi,
> 
> On Sat, Jul 16, 2022 at 7:23 PM Michael Niedermayer <michael@niedermayer.cc>
> wrote:
> 
> > What i meant was that newly added functions should be more flexible than
> > these old rules. That is 2 sets of rules
> > 1. What a caller ATM can do and expect to work (thats whats written there)
> > 2. What an implementor of new functions should make sure is supported
> >
> 
> What's the use case exactly that you'd like to support that we currently
> don't?

The idea is to have general purpose functions which can be used if someone
wants to write a new encoder. 
(this is kind of not a different goal, its rather that the original
 limitations where based on mpeg2/mpeg4/h264 which was the most flexible at
 the time)
 
thx


[...]
Michael Niedermayer July 16, 2022, 1:20 p.m. UTC | #10
On Sat, Jul 16, 2022 at 03:30:23PM +0300, Martin Storsjö wrote:
> On Sat, 16 Jul 2022, Michael Niedermayer wrote:
> 
> > On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote:
> > > On Fri, 15 Jul 2022, Michael Niedermayer wrote:
> > > 
> > > > On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
> > > > > On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
> > > > > 
> > > > > > If the max height is just 16, then this should be fine. I assumed that h
> > > > > > could have a much higher value (>1024), but if that is not the case,
> > > > > > then this is a useful optimization.
> > > > > 
> > > > > At least according to the me_cmp.h header, which says:
> > > > > 
> > > > > /* Motion estimation:
> > > > >  * h is limited to { width / 2, width, 2 * width },
> > > > >  * but never larger than 16 and never smaller than 2.
> > > > >  * Although currently h < 4 is not used as functions with
> > > > >  * width < 8 are neither used nor implemented. */
> > > > 
> > > > These rules where written with support for encoding of all
> > > > standard formats in mind at the time that was written.
> > > > today it may make sense to extend these rules to cover the
> > > > things which where created since then
> > > 
> > > Right, but if that suddenly changes, such a change also must expect that it
> > > might need updates to all assembly implementations that implement that
> > > interface currently. Right now, both the defacto case (any callers in the
> > > codebase) and the explicit documentation says that it can't be called with
> > > parameters outside of that range.
> > 
> > What i meant was that newly added functions should be more flexible than
> > these old rules. That is 2 sets of rules
> > 1. What a caller ATM can do and expect to work (thats whats written there)
> > 2. What an implementor of new functions should make sure is supported
> 
> With 2., do you mean if adding a new function into the same struct, or if
> implementing the existing pix_abs[0][..] functions?

i would say both


> 
> If you mean new implementations of the existing function interface, you say
> they "should be more flexible". How flexible must they be? Is it ok to
> assume h<=256 for the w=16 functions?

i think thats fine


> 
> Gradually increasing the requirements for existing function interfaces like
> you suggest is really problematic.

why ?
iam really just saying
"when you add new code, dont base it on old limitations"

And i suggest this because its much easier to do when a function is added than
when later one wants to use it 

When i originally wrote this list of restrictions it didnt list what our
code actually used but tried to look ahead to what we would need when we
write encoders for all standard formats at that time. We never wrote a
h264 encoder but this API was designed to support it.


> 
> If we want to require more of the functions, we should document it, and
> extend the checkasm test to test that new requirement - which also extends
> the requirement to the existing functions. If we don't have a checkasm test
> for the required behaviour, we can pretty much assume it's broken, even in
> new implementations.

yes

[...]
Martin Storsjö July 16, 2022, 2:23 p.m. UTC | #11
On Sat, 16 Jul 2022, Michael Niedermayer wrote:

> On Sat, Jul 16, 2022 at 03:30:23PM +0300, Martin Storsjö wrote:
>> On Sat, 16 Jul 2022, Michael Niedermayer wrote:
>>
>>> On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote:
>>>> On Fri, 15 Jul 2022, Michael Niedermayer wrote:
>>>>
>>>>> On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
>>>>>> On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
>>>>>>
>>>>>>> If the max height is just 16, then this should be fine. I assumed that h
>>>>>>> could have a much higher value (>1024), but if that is not the case,
>>>>>>> then this is a useful optimization.
>>>>>>
>>>>>> At least according to the me_cmp.h header, which says:
>>>>>>
>>>>>> /* Motion estimation:
>>>>>>  * h is limited to { width / 2, width, 2 * width },
>>>>>>  * but never larger than 16 and never smaller than 2.
>>>>>>  * Although currently h < 4 is not used as functions with
>>>>>>  * width < 8 are neither used nor implemented. */
>>>>>
>>>>> These rules where written with support for encoding of all
>>>>> standard formats in mind at the time that was written.
>>>>> today it may make sense to extend these rules to cover the
>>>>> things which where created since then
>>>>
>>>> Right, but if that suddenly changes, such a change also must expect that it
>>>> might need updates to all assembly implementations that implement that
>>>> interface currently. Right now, both the defacto case (any callers in the
>>>> codebase) and the explicit documentation says that it can't be called with
>>>> parameters outside of that range.
>>>
>>> What i meant was that newly added functions should be more flexible than
>>> these old rules. That is 2 sets of rules
>>> 1. What a caller ATM can do and expect to work (thats whats written there)
>>> 2. What an implementor of new functions should make sure is supported
>>
>> With 2., do you mean if adding a new function into the same struct, or if
>> implementing the existing pix_abs[0][..] functions?
>
> i would say both
>
>
>>
>> If you mean new implementations of the existing function interface, you say
>> they "should be more flexible". How flexible must they be? Is it ok to
>> assume h<=256 for the w=16 functions?
>
> i think thats fine

Ok, I'll go ahead and push this then.

>>
>> Gradually increasing the requirements for existing function interfaces like
>> you suggest is really problematic.
>
> why ?
> iam really just saying
> "when you add new code, dont base it on old limitations"

For this case, I just quoted what the header said, which seemed 
authoritative to me - but it's fine for me with a wider spec too, up to 
h=256. But saying arbitrarily "any height" really inhibits what you can do 
in asm.

Also if I shouldn't reference that limitation in the header, please 
update/reword it, as it's actively misleading for anyone working on 
this right now.

And in general, we can design for an intended use case and calculate 
whether it should work or not - but as long as it's not tested, those 
cases will often have hidden bugs - see e.g. patch 1/5 in this series.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 89546869fb..cda7ce0408 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -27,15 +27,16 @@  function ff_pix_abs16_neon, export=1
         // x3           ptrdiff_t stride
         // w4           int h
         cmp             w4, #4                      // if h < 4, jump to completion section
-        movi            v18.4S, #0                  // clear result accumulator
+        movi            v16.8h, #0                  // clear result accumulator
+        movi            v17.8h, #0                  // clear result accumulator
         b.lt            2f
 1:
         ld1             {v0.16b}, [x1], x3          // load pix1
         ld1             {v4.16b}, [x2], x3          // load pix2
         ld1             {v1.16b}, [x1], x3          // load pix1
         ld1             {v5.16b}, [x2], x3          // load pix2
-        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
-        uabdl2          v17.8h, v0.16b, v4.16b
+        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
+        uabal2          v17.8h, v0.16b, v4.16b
         ld1             {v2.16b}, [x1], x3          // load pix1
         ld1             {v6.16b}, [x2], x3          // load pix2
         uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
@@ -48,27 +49,26 @@  function ff_pix_abs16_neon, export=1
         uabal           v16.8h, v3.8b, v7.8b
         uabal2          v17.8h, v3.16b, v7.16b
         cmp             w4, #4                      // if h >= 4, loop
-        add             v16.8h, v16.8h, v17.8h
-        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
-        add             d18, d16, d18               // add to the end result register
 
         b.ge            1b
         cbnz            w4, 2f                      // if iterations remain, jump to completion section
 
-        fmov            w0, s18                     // copy result to general purpose register
+        add             v16.8h, v16.8h, v17.8h
+        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
+        fmov            w0, s16                     // copy result to general purpose register
         ret
 
 2:
         ld1             {v0.16b}, [x1], x3          // load pix1
         ld1             {v4.16b}, [x2], x3          // load pix2
-        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
-        uabal2          v16.8h, v0.16b, v4.16b
         subs            w4, w4, #1                  // h -= 1
-        addv            h16, v16.8h                 // add up v16
-        add             d18, d16, d18               // add to result
+        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
+        uabal2          v17.8h, v0.16b, v4.16b
         b.ne            2b
 
-        fmov            w0, s18                     // copy result to general purpose register
+        add             v16.8h, v16.8h, v17.8h
+        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
+        fmov            w0, s16                     // copy result to general purpose register
         ret
 endfunc
 
@@ -80,7 +80,8 @@  function ff_pix_abs16_xy2_neon, export=1
         // w4           int h
 
         add             x5, x2, x3                  // use x5 to hold uint8_t *pix3
-        movi            v0.2d, #0                   // initialize the result register
+        movi            v21.8h, #0                  // initialize the result register
+        movi            v22.8h, #0                  // initialize the result register
 
         // Load initial pix2 values for either the unrolled version or completion version.
         ldur            q4, [x2, #1]                // load pix2+1
@@ -119,15 +120,15 @@  function ff_pix_abs16_xy2_neon, export=1
         uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
         uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
 
-        ldur            q22, [x5, #1]               // load pix3+1
+        ldur            q7, [x5, #1]                // load pix3+1
 
         add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
         add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above
 
-        uabdl           v24.8h, v1.8b,  v23.8b      // absolute difference 0..7, i=0
-        uabdl2          v23.8h, v1.16b, v23.16b     // absolute difference 8..15, i=0
+        uabal           v21.8h, v1.8b,  v23.8b      // absolute difference 0..7, i=0
+        uabal2          v22.8h, v1.16b, v23.16b     // absolute difference 8..15, i=0
 
-        ld1             {v21.16b}, [x5], x3         // load pix3
+        ld1             {v6.16b}, [x5], x3          // load pix3
         ld1             {v20.16b}, [x1], x3         // load pix1
 
         rshrn           v26.8b, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
@@ -140,33 +141,33 @@  function ff_pix_abs16_xy2_neon, export=1
         rshrn           v28.8b, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
         rshrn2          v28.16b, v29.8h, #2         // shift right 2 8..15
 
-        uabal           v24.8h, v16.8b,  v26.8b     // absolute difference 0..7, i=1
-        uabal2          v23.8h, v16.16b, v26.16b    // absolute difference 8..15, i=1
+        uabal           v21.8h, v16.8b,  v26.8b     // absolute difference 0..7, i=1
+        uabal2          v22.8h, v16.16b, v26.16b    // absolute difference 8..15, i=1
 
-        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
-        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
+        uaddl           v2.8h, v6.8b,  v7.8b        // pix3 + pix3+1 0..7
+        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
         add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
         add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
         rshrn           v30.8b, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
         rshrn2          v30.16b, v31.8h, #2         // shift right 2 8..15
 
-        uabal           v24.8h, v17.8b,  v28.8b     // absolute difference 0..7, i=2
-        uabal2          v23.8h, v17.16b, v28.16b    // absolute difference 8..15, i=2
-
         sub             w4, w4, #4                  // h -= 4
 
-        uabal           v24.8h, v20.8b,  v30.8b     // absolute difference 0..7, i=3
-        uabal2          v23.8h, v20.16b, v30.16b    // absolute difference 8..15, i=3
+        uabal           v21.8h, v17.8b,  v28.8b     // absolute difference 0..7, i=2
+        uabal2          v22.8h, v17.16b, v28.16b    // absolute difference 8..15, i=2
 
         cmp             w4, #4                      // loop if h >= 4
 
-        add             v4.8h, v23.8h, v24.8h
-        uaddlv          s4, v4.8h                   // finish adding up accumulated values
-        add             d0, d0, d4                  // add the value to the top level accumulator
+
+        uabal           v21.8h, v20.8b,  v30.8b     // absolute difference 0..7, i=3
+        uabal2          v22.8h, v20.16b, v30.16b    // absolute difference 8..15, i=3
 
         b.ge            1b
         cbnz            w4, 2f                      // if iterations remain jump to completion section
 
+        add             v4.8h, v21.8h, v22.8h
+        uaddlv          s0, v4.8h                   // finish adding up accumulated values
+
         fmov            w0, s0                      // copy result to general purpose register
         ret
 2:
@@ -182,20 +183,18 @@  function ff_pix_abs16_xy2_neon, export=1
         add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
         add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
         // divide by 4 to compute the average of values summed above
-        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
-        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15
-
-        uxtl2           v7.8h, v1.16b               // 8->16 bits pix1 8..15
-        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
+        rshrn           v16.8b,  v16.8h, #2         // shift right by 2 0..7 (rounding shift right)
+        rshrn2          v16.16b, v17.8h, #2         // shift right by 2 8..15
 
-        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
-        uaba            v6.8h, v7.8h, v17.8h        // absolute difference accumulate 8..15
+        uabal           v21.8h, v1.8b,  v16.8b      // absolute difference 0..7
+        uabal2          v22.8h, v1.16b, v16.16b     // absolute difference accumulate 8..15
         mov             v2.16b, v18.16b             // pix3 -> pix2
         mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
-        uaddlv          s6, v6.8h                   // add up accumulator in v6
-        add             d0, d0, d6                  // add to the final result
 
         b.ne            2b                          // loop if h > 0
+
+        add             v4.8h, v21.8h, v22.8h
+        uaddlv          s0, v4.8h                   // finish adding up accumulated values
         fmov            w0, s0                      // copy result to general purpose register
         ret
 endfunc
@@ -209,7 +208,8 @@  function ff_pix_abs16_x2_neon, export=1
 
         cmp             w4, #4
         // initialize buffers
-        movi            d20, #0
+        movi            v16.8h, #0
+        movi            v17.8h, #0
         add             x5, x2, #1 // pix2 + 1
         b.lt            2f
 
@@ -224,9 +224,9 @@  function ff_pix_abs16_x2_neon, export=1
         ld1             {v2.16b}, [x5], x3
         urhadd          v30.16b, v1.16b, v2.16b
         ld1             {v0.16b}, [x1], x3
-        uabdl           v16.8h, v0.8b, v30.8b
+        uabal           v16.8h, v0.8b, v30.8b
         ld1             {v4.16b}, [x2], x3
-        uabdl2          v17.8h, v0.16b, v30.16b
+        uabal2          v17.8h, v0.16b, v30.16b
         ld1             {v5.16b}, [x5], x3
         urhadd          v29.16b, v4.16b, v5.16b
         ld1             {v3.16b}, [x1], x3
@@ -238,20 +238,15 @@  function ff_pix_abs16_x2_neon, export=1
         ld1             {v6.16b}, [x1], x3
         uabal           v16.8h, v6.8b, v28.8b
         ld1             {v24.16b}, [x2], x3
+        sub             w4, w4, #4
         uabal2          v17.8h, v6.16b, v28.16b
         ld1             {v25.16b}, [x5], x3
         urhadd          v27.16b, v24.16b, v25.16b
         ld1             {v23.16b}, [x1], x3
+        cmp             w4, #4
         uabal           v16.8h, v23.8b, v27.8b
         uabal2          v17.8h, v23.16b, v27.16b
 
-        sub             w4, w4, #4
-
-        add             v16.8h, v16.8h, v17.8h
-        uaddlv          s16, v16.8h
-        cmp             w4, #4
-        add             d20, d20, d16
-
         b.ge            1b
         cbz             w4, 3f
 
@@ -259,18 +254,19 @@  function ff_pix_abs16_x2_neon, export=1
 2:
         ld1             {v1.16b}, [x2], x3
         ld1             {v2.16b}, [x5], x3
+        subs            w4, w4, #1
         urhadd          v29.16b, v1.16b, v2.16b
         ld1             {v0.16b}, [x1], x3
-        uabd            v28.16b, v0.16b, v29.16b
+        uabal           v16.8h, v0.8b,  v29.8b
+        uabal2          v17.8h, v0.16b, v29.16b
 
-        uaddlv          h28, v28.16b
-        subs            w4, w4, #1
 
-        add             d20, d20, d28
         b.ne            2b
 
 3:
-        fmov            w0, s20
+        add             v16.8h, v16.8h, v17.8h
+        uaddlv          s16, v16.8h
+        fmov            w0, s16
 
         ret
 endfunc