diff mbox series

[FFmpeg-devel,2/5] lavc/aarch64: Add neon implementation for sse4

Message ID 20220715080228.686736-3-hum@semihalf.com
State New
Headers show
Series Add neon implementation for me_cmp functions | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Hubert Mazur July 15, 2022, 8:02 a.m. UTC
Provide neon implementation for sse4 function.

Performance comparison tests are shown below.
- sse_2_c: 74.0
- sse_2_neon: 24.0

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
 libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

Comments

Martin Storsjö July 21, 2022, 9:43 p.m. UTC | #1
On Fri, 15 Jul 2022, Hubert Mazur wrote:

> Provide neon implementation for sse4 function.
>
> Performance comparison tests are shown below.
> - sse_2_c: 74.0
> - sse_2_neon: 24.0
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
> libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
> 2 files changed, 68 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 3ff5767bd0..72a2062e7e 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> 
> int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>                   ptrdiff_t stride, int h);
> +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> +                  ptrdiff_t stride, int h);
> 
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>
>         c->sad[0] = ff_pix_abs16_neon;
>         c->sse[0] = sse16_neon;
> +        c->sse[2] = sse4_neon;
>     }
> }
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 88cd335443..bacf151314 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -360,3 +360,68 @@ function sse16_neon, export=1
>         ret
> 
> endfunc
> +
> +function sse4_neon, export=1
> +        // x0 - unused
> +        // x1 - pix1
> +        // x2 - pix2
> +        // x3 - stride
> +        // w4 - h
> +
> +        movi            d18, #0
> +        movi            d17, #0
> +        cmp             w4, #4
> +        b.le            2f
> +
> +// make 4 iterations at once
> +1:
> +
> +        // res = abs(pix1[0] - pix2[0])
> +        // res * res
> +
> +        ld1             {v0.4b}, [x1], x3

This fails to assemble for me with essentially all tools I have (old 
binutils, moderately recent binutils, current llvm, MS armasm64.exe):

src/libavcodec/aarch64/me_cmp_neon.S:374: Error: operand mismatch -- `ld1 
{v0.4b},[x1],x3'
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:    did you mean this?
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.8b}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:    other valid variant(s):
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.16b}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.4h}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.8h}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.2s}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.4s}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.1d}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.2d}, [x1], x3


I'll follow up with an actual review of the patches later. I'm sorry I 
have a bit longer review latency than usual at the moment, as I'm on 
vacation.

// Martin
Swinney, Jonathan July 22, 2022, 9:30 p.m. UTC | #2
As Martin noted, this patch doesn't build. Other than that, it would be nice if each line had a comment noting which of the 4 iterations the instruction belongs to. That would make the code a little easier to read, in my opinion, since the instructions are manually reordered.

Thanks,
-- 

Jonathan Swinney

On 7/15/22, 3:03 AM, "Hubert Mazur" <hum@semihalf.com> wrote:

    CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.



    Provide neon implementation for sse4 function.

    Performance comparison tests are shown below.
    - sse_2_c: 74.0
    - sse_2_neon: 24.0

    Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

    Signed-off-by: Hubert Mazur <hum@semihalf.com>
    ---
     libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
     libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
     2 files changed, 68 insertions(+)

    diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
    index 3ff5767bd0..72a2062e7e 100644
    --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
    +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
    @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,

     int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
    +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
    +                  ptrdiff_t stride, int h);

     av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
     {
    @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)

             c->sad[0] = ff_pix_abs16_neon;
             c->sse[0] = sse16_neon;
    +        c->sse[2] = sse4_neon;
         }
     }
    diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
    index 88cd335443..bacf151314 100644
    --- a/libavcodec/aarch64/me_cmp_neon.S
    +++ b/libavcodec/aarch64/me_cmp_neon.S
    @@ -360,3 +360,68 @@ function sse16_neon, export=1
             ret

     endfunc
    +
    +function sse4_neon, export=1
    +        // x0 - unused
    +        // x1 - pix1
    +        // x2 - pix2
    +        // x3 - stride
    +        // w4 - h
    +
    +        movi            d18, #0
    +        movi            d17, #0
    +        cmp             w4, #4
    +        b.le            2f
    +
    +// make 4 iterations at once
    +1:
    +
    +        // res = abs(pix1[0] - pix2[0])
    +        // res * res
    +
    +        ld1             {v0.4b}, [x1], x3
    +        ld1             {v1.4b}, [x2], x3
    +        uabdl           v30.8h, v0.4b, v1.4b
    +        ld1             {v2.4b}, [x1], x3
    +        ld1             {v3.4b}, [x2], x3
    +        umull           v16.4s, v30.4h, v30.4h
    +        uabdl           v29.8h, v2.4b, v3.4b
    +        ld1             {v4.4b}, [x1], x3
    +        ld1             {v5.4b}, [x2], x3
    +        umlal           v16.4s, v29.4h, v29.4h
    +        uabdl           v28.8h, v4.4b, v5.4b
    +        ld1             {v6.4b}, [x1], x3
    +        ld1             {v7.4b}, [x2], x3
    +        umlal           v16.4s, v28.4h, v28.4h
    +        uabdl           v27.8h, v6.4b, v7.4b
    +        umlal           v16.4s, v27.4h, v27.4h
    +
    +        uaddlv          d17, v16.4s
    +        add             d18, d18, d17
    +
    +        sub             w4, w4, #4
    +        cmp             w4, #4
    +        b.ge            1b
    +
    +        cbnz            w4, 2f
    +        fmov            w0, s18
    +
    +        ret
    +
    +// iterate by one
    +2:
    +        ld1             {v0.4b}, [x1], x3
    +        ld1             {v1.4b}, [x2], x3
    +        uabdl           v30.8h, v0.4b, v1.4b
    +        umull           v16.4s, v30.4h, v30.4h
    +
    +        uaddlv          d17, v16.4s
    +        add             d18, d18, d17
    +
    +        subs            w4, w4, #1
    +        b.ne            2b
    +        fmov            w0, s18
    +
    +        ret
    +
    +endfunc
    --
    2.34.1
diff mbox series

Patch

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3ff5767bd0..72a2062e7e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -32,6 +32,8 @@  int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 
 int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
+int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -44,5 +46,6 @@  av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->sad[0] = ff_pix_abs16_neon;
         c->sse[0] = sse16_neon;
+        c->sse[2] = sse4_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 88cd335443..bacf151314 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -360,3 +360,68 @@  function sse16_neon, export=1
         ret
 
 endfunc
+
+function sse4_neon, export=1
+        // x0 - unused
+        // x1 - pix1
+        // x2 - pix2
+        // x3 - stride
+        // w4 - h
+
+        movi            d18, #0                         // d18 = running sum of squared differences
+        // fewer than 4 rows: only the row-at-a-time loop runs (h == 4 stays unrolled)
+        cmp             w4, #4
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+
+        // A row is 4 bytes: load it into lane s[0] and widen with the 8b form.
+        // Lanes above s[0] are don't-care; only the low 4 halfwords get squared.
+
+        ld1             {v0.s}[0], [x1], x3             // row 0: pix1, advance by stride
+        ld1             {v1.s}[0], [x2], x3             // row 0: pix2, advance by stride
+        uabdl           v30.8h, v0.8b, v1.8b            // row 0: abs(pix1 - pix2)
+        ld1             {v2.s}[0], [x1], x3             // row 1: pix1
+        ld1             {v3.s}[0], [x2], x3             // row 1: pix2
+        umull           v16.4s, v30.4h, v30.4h          // row 0: square, start accumulator
+        uabdl           v29.8h, v2.8b, v3.8b            // row 1: abs(pix1 - pix2)
+        ld1             {v4.s}[0], [x1], x3             // row 2: pix1
+        ld1             {v5.s}[0], [x2], x3             // row 2: pix2
+        umlal           v16.4s, v29.4h, v29.4h          // row 1: square and accumulate
+        uabdl           v28.8h, v4.8b, v5.8b            // row 2: abs(pix1 - pix2)
+        ld1             {v6.s}[0], [x1], x3             // row 3: pix1
+        ld1             {v7.s}[0], [x2], x3             // row 3: pix2
+        umlal           v16.4s, v28.4h, v28.4h          // row 2: square and accumulate
+        uabdl           v27.8h, v6.8b, v7.8b            // row 3: abs(pix1 - pix2)
+        umlal           v16.4s, v27.4h, v27.4h          // row 3: square and accumulate
+
+        uaddlv          d17, v16.4s                     // horizontal sum of the 4 columns
+        add             d18, d18, d17                   // fold into the running total
+
+        sub             w4, w4, #4                      // 4 rows consumed
+        cmp             w4, #4
+        b.ge            1b                              // another full group of 4 rows left
+
+        cbnz            w4, 2f                          // 1..3 rows left: finish one by one
+        fmov            w0, s18                         // return low 32 bits of the total
+
+        ret
+
+// iterate by one
+2:
+        ld1             {v0.s}[0], [x1], x3             // pix1 row, advance by stride
+        ld1             {v1.s}[0], [x2], x3             // pix2 row, advance by stride
+        uabdl           v30.8h, v0.8b, v1.8b            // abs(pix1 - pix2)
+        umull           v16.4s, v30.4h, v30.4h          // square the 4 valid differences
+
+        uaddlv          d17, v16.4s                     // horizontal sum of the row
+        add             d18, d18, d17                   // fold into the running total
+
+        subs            w4, w4, #1                      // one row consumed, sets flags
+        b.ne            2b
+        fmov            w0, s18                         // return low 32 bits of the total
+
+        ret
+
+endfunc