
[FFmpeg-devel] lavc/aarch64: Add neon implementation for sse4

Message ID 20220725111541.44618-1-hum@semihalf.com
State New
Series [FFmpeg-devel] lavc/aarch64: Add neon implementation for sse4

Checks

Context                          Check    Description
yinshiyou/configure_loongarch64  warning  Failed to apply patch
andriy/configure_x86             warning  Failed to apply patch

Commit Message

Hubert Mazur July 25, 2022, 11:15 a.m. UTC
Provide a neon implementation of the sse4 function.

Performance comparison tests are shown below.
- sse_2_c: 74.0
- sse_2_neon: 24.0

Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
 libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
 2 files changed, 68 insertions(+)
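
For reference, c->sse[2] computes the sum of squared differences over a
4-pixel-wide block of height h. A rough C model of the scalar behaviour the
neon code replaces (an editor's sketch; sse4_ref is a hypothetical name, and
the actual scalar version in libavcodec/me_cmp.c goes through a table of
squares):

    static int sse4_ref(const uint8_t *pix1, const uint8_t *pix2,
                        ptrdiff_t stride, int h)
    {
        int sum = 0;
        for (int i = 0; i < h; i++) {        // h rows
            for (int j = 0; j < 4; j++) {    // 4 pixels per row
                int d = pix1[j] - pix2[j];   // per-pixel difference
                sum += d * d;                // accumulate squared error
            }
            pix1 += stride;
            pix2 += stride;
        }
        return sum;
    }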

Comments

Swinney, Jonathan July 28, 2022, 6:50 p.m. UTC | #1
Your latest set of patches didn’t get interpreted correctly by the patchwork tool. I suspect it took them in the wrong order. 

https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=&submitter=1479&state=&q=&archive=&delegate=

There is one more place where the sub, cmp and branch instructions could be moved apart, in sse16_neon. It doesn't seem to make any difference on Neoverse N1 and V1, and it may help the A53.
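
As a sketch of what that means (an editor's illustration, not code from the
thread), the counter update, the compare, and the branch can be spread
through the loop body so in-order cores like the A53 don't stall on
back-to-back dependent instructions:

1:
        ld1             {v0.s}[0], [x1], x3             // first loads of the body
        sub             w4, w4, #4                      // decrement the counter early
        ...                                             // remaining loads, uabdl, umlal
        cmp             w4, #4                          // compare well before the branch
        umlal           v16.4s, v27.4h, v27.4h          // last accumulate of the body
        b.ge            1b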

I didn't see anything else.

Thanks!
-- 

Jonathan Swinney

Swinney, Jonathan July 28, 2022, 6:51 p.m. UTC | #2
> There is one more place to move the sub, cmp and branch instructions apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1 and V1 and it may help A53.

Sorry-- I meant sse4_neon.

-- 

Jonathan Swinney

Hubert Mazur July 29, 2022, 7:26 a.m. UTC | #3
Yes, it seems they were either misplaced or each was treated as a new
series, and thus they couldn't be applied.
I will resend the whole batch after the first round of review, with some
of the issues fixed.
Thanks for the feedback!

Martin Storsjö Aug. 4, 2022, 8 a.m. UTC | #4
On Mon, 25 Jul 2022, Hubert Mazur wrote:

> Provide neon implementation for sse4 function.
>
> Performance comparison tests are shown below.
> - sse_2_c: 74.0
> - sse_2_neon: 24.0
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> [...]
> +function sse4_neon, export=1
> +        // x0 - unused
> +        // x1 - pix1
> +        // x2 - pix2
> +        // x3 - stride
> +        // w4 - h
> +
> +        movi            d18, #0
> +        movi            d17, #0

In the current implementation, it doesn't seem like d17 needs to be
initialized here.

> +        cmp             w4, #4
> +        b.le            2f
> +
> +// make 4 iterations at once
> +1:
> +
> +        // res = abs(pix1[0] - pix2[0])
> +        // res * res
> +
> +        ld1             {v0.s}[0], [x1], x3             // Load pix1, first iteration
> +        ld1             {v1.s}[0], [x2], x3             // Load pix2, first iteration
> +        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration

Right now, half of the values calculated by uabdl are unused; you could 
try loading two iterations into v0.s[0] and v0.s[1] so that the full 
.8b register gets used. Doing that would reduce the number of uabdl 
instructions from 4 to 2 - but it might make it harder to interleave 
instructions efficiently. So after all, maybe it's not worth it, if we 
can make the loads more efficiently interleaved this way?
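
For what it's worth, a minimal sketch of that variant (an editor's
illustration, assuming v16 and v17 are accumulators already zeroed outside
the loop) could look like:

        ld1             {v0.s}[0], [x1], x3             // pix1, row 0 into lane 0
        ld1             {v0.s}[1], [x1], x3             // pix1, row 1 into lane 1
        ld1             {v1.s}[0], [x2], x3             // pix2, row 0
        ld1             {v1.s}[1], [x2], x3             // pix2, row 1
        uabdl           v30.8h, v0.8b, v1.8b            // one uabdl now covers two rows
        umlal           v16.4s, v30.4h, v30.4h          // accumulate squares of row 0
        umlal2          v17.4s, v30.8h, v30.8h          // accumulate squares of row 1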

Again, also here, it'd be good to interleave things more efficiently, e.g. 
like this:

    ld1 first
    ld1 first
    ld1 second
    ld1 second
    uabdl first
    ld1 third
    ld1 third
    uabdl second
    umull first
    ld1 fourth
    ld1 fourth
    uabdl third
    umlal second
    uabdl fourth
    umlal third
    umlal fourth
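
With the register assignments from the patch, that ordering would come out
roughly like this (an editor's rendering of the schedule above, not code
from the thread):

        ld1             {v0.s}[0], [x1], x3             // pix1, first
        ld1             {v1.s}[0], [x2], x3             // pix2, first
        ld1             {v2.s}[0], [x1], x3             // pix1, second
        ld1             {v3.s}[0], [x2], x3             // pix2, second
        uabdl           v30.8h, v0.8b, v1.8b            // first
        ld1             {v4.s}[0], [x1], x3             // pix1, third
        ld1             {v5.s}[0], [x2], x3             // pix2, third
        uabdl           v29.8h, v2.8b, v3.8b            // second
        umull           v16.4s, v30.4h, v30.4h          // first
        ld1             {v6.s}[0], [x1], x3             // pix1, fourth
        ld1             {v7.s}[0], [x2], x3             // pix2, fourth
        uabdl           v28.8h, v4.8b, v5.8b            // third
        umlal           v16.4s, v29.4h, v29.4h          // second
        uabdl           v27.8h, v6.8b, v7.8b            // fourth
        umlal           v16.4s, v28.4h, v28.4h          // third
        umlal           v16.4s, v27.4h, v27.4h          // fourth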

> +        ld1             {v2.s}[0], [x1], x3             // Load pix1, second iteration
> +        ld1             {v3.s}[0], [x2], x3             // Load pix2, second iteration
> +        umull           v16.4s, v30.4h, v30.4h          // Multiply vectors, first iteration
> +        uabdl           v29.8h, v2.8b, v3.8b            // Absolute difference, second iteration
> +        ld1             {v4.s}[0], [x1], x3             // Load pix1, third iteration
> +        ld1             {v5.s}[0], [x2], x3             // Load pix2, third iteration
> +        umlal           v16.4s, v29.4h, v29.4h          // Multiply and accumulate, second iteration
> +        uabdl           v28.8h, v4.8b, v5.8b            // Absolute difference, third iteration
> +        ld1             {v6.s}[0], [x1], x3             // Load pix1, fourth iteration
> +        ld1             {v7.s}[0], [x2], x3             // Load pix2, fourth iteration
> +        umlal           v16.4s, v28.4h, v28.4h          // Multiply and accumulate, third iteration
> +        uabdl           v27.8h, v6.8b, v7.8b            // Absolute difference, fourth iteration
> +        umlal           v16.4s, v27.4h, v27.4h          // Multiply and accumulate, fourth iteration
> +
> +        uaddlv          d17, v16.4s                     // Add vector
> +        add             d18, d18, d17

As usual, don't do any *add*v within the loop, defer it as far as 
possible. Here you're accumulating in 32-bit elements, so it will surely 
fit the results from the whole algorithm.

Also, if you get rid of the uaddlv here, you can also accumulate into two 
separate .4s registers that you only add at the end; that allows two umlal 
instructions to possibly execute in parallel without waiting for each 
other (provided that the cpu has enough execution units for that).
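
A sketch of that structure (an editor's illustration; the register choice
is arbitrary):

        movi            v16.4s, #0                      // accumulator A
        movi            v17.4s, #0                      // accumulator B
1:
        ...                                             // loads and uabdl as before
        umlal           v16.4s, v30.4h, v30.4h          // first iteration -> A
        umlal           v17.4s, v29.4h, v29.4h          // second iteration -> B
        umlal           v16.4s, v28.4h, v28.4h          // third iteration -> A
        umlal           v17.4s, v27.4h, v27.4h          // fourth iteration -> B
        ...
        b.ge            1b
        add             v16.4s, v16.4s, v17.4s          // combine the accumulators once
        uaddlv          d16, v16.4s                     // single horizontal add, after the loop
        fmov            w0, s16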

// Martin

Patch

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3ff5767bd0..72a2062e7e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -32,6 +32,8 @@  int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 
 int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
+int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -44,5 +46,6 @@  av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->sad[0] = ff_pix_abs16_neon;
         c->sse[0] = sse16_neon;
+        c->sse[2] = sse4_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 98c912b608..3336d88848 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -352,3 +352,68 @@  function sse16_neon, export=1
         ret
 
 endfunc
+
+function sse4_neon, export=1
+        // x0 - unused
+        // x1 - pix1
+        // x2 - pix2
+        // x3 - stride
+        // w4 - h
+
+        movi            d18, #0
+        movi            d17, #0
+        cmp             w4, #4
+        b.le            2f
+
+// make 4 iterations at once
+1:
+
+        // res = abs(pix1[0] - pix2[0])
+        // res * res
+
+        ld1             {v0.s}[0], [x1], x3             // Load pix1, first iteration
+        ld1             {v1.s}[0], [x2], x3             // Load pix2, first iteration
+        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration
+        ld1             {v2.s}[0], [x1], x3             // Load pix1, second iteration
+        ld1             {v3.s}[0], [x2], x3             // Load pix2, second iteration
+        umull           v16.4s, v30.4h, v30.4h          // Multiply vectors, first iteration
+        uabdl           v29.8h, v2.8b, v3.8b            // Absolute difference, second iteration
+        ld1             {v4.s}[0], [x1], x3             // Load pix1, third iteration
+        ld1             {v5.s}[0], [x2], x3             // Load pix2, third iteration
+        umlal           v16.4s, v29.4h, v29.4h          // Multiply and accumulate, second iteration
+        uabdl           v28.8h, v4.8b, v5.8b            // Absolute difference, third iteration
+        ld1             {v6.s}[0], [x1], x3             // Load pix1, fourth iteration
+        ld1             {v7.s}[0], [x2], x3             // Load pix2, fourth iteration
+        umlal           v16.4s, v28.4h, v28.4h          // Multiply and accumulate, third iteration
+        uabdl           v27.8h, v6.8b, v7.8b            // Absolute difference, fourth iteration
+        umlal           v16.4s, v27.4h, v27.4h          // Multiply and accumulate, fourth iteration
+
+        uaddlv          d17, v16.4s                     // Add vector
+        add             d18, d18, d17
+
+        sub             w4, w4, #4
+        cmp             w4, #4
+        b.ge            1b
+
+        cbnz            w4, 2f
+        fmov            w0, s18
+
+        ret
+
+// iterate by one
+2:
+        ld1             {v0.s}[0], [x1], x3               // Load pix1
+        ld1             {v1.s}[0], [x2], x3               // Load pix2
+        uabdl           v30.8h, v0.8b, v1.8b
+        umull           v16.4s, v30.4h, v30.4h
+
+        uaddlv          d17, v16.4s
+        add             d18, d18, d17
+
+        subs            w4, w4, #1
+        b.ne            2b
+        fmov            w0, s18
+
+        ret
+
+endfunc