
[FFmpeg-devel,5/5] lavc/aarch64: Provide neon implementation of nsse16

Message ID 20220906102722.53266-6-hum@semihalf.com
State New
Series Provide optimized neon implementation

Checks

Context                          Check    Description
yinshiyou/make_loongarch64       success  Make finished
yinshiyou/make_fate_loongarch64  success  Make fate finished
andriy/make_x86                  success  Make finished
andriy/make_fate_x86             success  Make fate finished

Commit Message

Hubert Mazur Sept. 6, 2022, 10:27 a.m. UTC
Add a vectorized implementation of the nsse16 function.

Performance comparison tests are shown below.
- nsse_0_c: 707.0
- nsse_0_neon: 120.0

Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  15 +++
 libavcodec/aarch64/me_cmp_neon.S         | 124 +++++++++++++++++++++++
 2 files changed, 139 insertions(+)
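
For context: nsse ("noise preserving sum of squared errors") rates a candidate
block by its plain SSE plus a weighted absolute difference between the two
blocks' second-order gradients, so the encoder can prefer errors that keep the
source's texture. Below is a from-memory sketch of the scalar reference this
replaces (nsse16_c in libavcodec/me_cmp.c; the name nsse16_ref is ours, and the
tree is authoritative):

static int nsse16_ref(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                      ptrdiff_t stride, int h)
{
    int score1 = 0, score2 = 0;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)                 // plain SSE term -> score1
            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
        if (y + 1 < h) {                             // h - 1 gradient rows
            for (int x = 0; x < 15; x++)             // 15 columns: needs x + 1
                score2 += FFABS(s1[x] - s1[x + stride] - s1[x + 1] + s1[x + stride + 1]) -
                          FFABS(s2[x] - s2[x + stride] - s2[x + 1] + s2[x + stride + 1]);
        }
        s1 += stride;
        s2 += stride;
    }

    /* FFABS() comes from libavutil/common.h */
    return score1 + FFABS(score2) * (c ? c->avctx->nsse_weight : 8);
}

This maps directly onto the assembly below: the bl to X(sse16_neon) produces
score1, the ext #1/stride saba loops accumulate the two gradient terms, the
ins v17.h[7], wzr drops the 16th column (only 15 take part), and the final
mul/add applies nsse_weight.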

Comments

Martin Storsjö Sept. 7, 2022, 9:06 a.m. UTC | #1
On Tue, 6 Sep 2022, Hubert Mazur wrote:

> Add a vectorized implementation of the nsse16 function.
>
> Performance comparison tests are shown below.
> - nsse_0_c: 707.0
> - nsse_0_neon: 120.0
>
> Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  15 +++
> libavcodec/aarch64/me_cmp_neon.S         | 124 +++++++++++++++++++++++
> 2 files changed, 139 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 8c295d5457..ea7f295373 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -49,6 +49,10 @@ int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
>                 ptrdiff_t stride, int h);
> int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
>                       ptrdiff_t stride, int h);
> +int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
> +                ptrdiff_t stride, int h);
> +int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> +                        ptrdiff_t stride, int h);
> 
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -72,5 +76,16 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>
>         c->vsse[0] = vsse16_neon;
>         c->vsse[4] = vsse_intra16_neon;
> +
> +        c->nsse[0] = nsse16_neon_wrapper;
>     }
> }
> +
> +int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> +                        ptrdiff_t stride, int h)
> +{
> +        if (c)
> +            return nsse16_neon(c->avctx->nsse_weight, s1, s2, stride, h);
> +        else
> +            return nsse16_neon(8, s1, s2, stride, h);
> +}

The body of this function is still indented 4 spaces too much.
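
For illustration, with the usual four-space body indentation it would read:

int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                        ptrdiff_t stride, int h)
{
    if (c)
        return nsse16_neon(c->avctx->nsse_weight, s1, s2, stride, h);
    else
        return nsse16_neon(8, s1, s2, stride, h);
}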

> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index cf2b8da425..bd21122a21 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -847,3 +847,127 @@ function vsse_intra16_neon, export=1
>
>         ret
> endfunc
> +
> +function nsse16_neon, export=1
> +        // x0           multiplier
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        str             x0, [sp, #-0x40]!
> +        stp             x1, x2, [sp, #0x10]
> +        stp             x3, x4, [sp, #0x20]
> +        str             x30, [sp, #0x30]
> +        bl              X(sse16_neon)
> +        ldr             x30, [sp, #0x30]
> +        mov             w9, w0                                  // here we store score1
> +        ldr             x5, [sp]
> +        ldp             x1, x2, [sp, #0x10]
> +        ldp             x3, x4, [sp, #0x20]
> +        add             sp, sp, #0x40
> +
> +        movi            v16.8h, #0
> +        movi            v17.8h, #0
> +        movi            v18.8h, #0
> +        movi            v19.8h, #0
> +
> +        ld1             {v0.16b}, [x1], x3
> +        subs            w4, w4, #1                              // we need to make h-1 iterations
> +        ext             v1.16b, v0.16b, v0.16b, #1              // x1 + 1
> +        ld1             {v2.16b}, [x2], x3
> +        cmp             w4, #2
> +        ext             v3.16b, v2.16b, v2.16b, #1              // x2 + 1

The scheduling here (and in both loops below) is a bit non-ideal; we use 
v0 very early after loading it, while we could start the load of v2 before 
that.
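
For example, something like this (the attached patch is authoritative):

        ld1             {v0.16b}, [x1], x3
        ld1             {v2.16b}, [x2], x3                      // start the second load early
        subs            w4, w4, #1                              // we need to make h-1 iterations
        ext             v1.16b, v0.16b, v0.16b, #1              // x1 + 1
        cmp             w4, #2
        ext             v3.16b, v2.16b, v2.16b, #1              // x2 + 1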

> +
> +        b.lt            2f
> +
> +// make 2 iterations at once
> +1:
> +        ld1             {v4.16b}, [x1], x3
> +        ld1             {v20.16b}, [x1], x3

The load of v20 here would stall a bit, since the first load updates x1. 
Instead we can launch the load of v6 in between these two.
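
For example:

1:
        ld1             {v4.16b}, [x1], x3
        ld1             {v6.16b}, [x2], x3                      // doesn't depend on the x1 update
        ld1             {v20.16b}, [x1], x3
        ld1             {v22.16b}, [x2], x3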

> +        ext             v5.16b, v4.16b, v4.16b, #1              // x1 + stride + 1
> +        ext             v21.16b, v20.16b, v20.16b, #1

We can move the use of v20 a bit later here, in between the two other 
exts.
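
For example (again, the attached patch is authoritative):

        ext             v5.16b, v4.16b, v4.16b, #1              // x1 + stride + 1
        ext             v7.16b, v6.16b, v6.16b, #1              // x2 + stride + 1
        ext             v21.16b, v20.16b, v20.16b, #1           // v20 is now a full load older
        ext             v23.16b, v22.16b, v22.16b, #1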

These changes (plus a similar minor one to the non-unrolled version at the 
bottom) produce the following benchmark change for me:

Before:      Cortex A53      A72      A73
nsse_0_neon:      401.0    198.0    194.5
After:
nsse_0_neon:      377.0    198.7    196.5

(The differences on A72 and A73 are within the measurement noise; those 
numbers vary more than that from one run to another.)

You can squash in the attached patch for simplicity. Also, remember to fix 
the indentation of the wrapper function.

With those changes, plus updated benchmark numbers in the commit messages, 
I think this patchset should be good to go.

// Martin

Patch

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 8c295d5457..ea7f295373 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -49,6 +49,10 @@  int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                 ptrdiff_t stride, int h);
 int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
                       ptrdiff_t stride, int h);
+int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
+                ptrdiff_t stride, int h);
+int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+                        ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -72,5 +76,16 @@  av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->vsse[0] = vsse16_neon;
         c->vsse[4] = vsse_intra16_neon;
+
+        c->nsse[0] = nsse16_neon_wrapper;
     }
 }
+
+int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+                        ptrdiff_t stride, int h)
+{
+        if (c)
+            return nsse16_neon(c->avctx->nsse_weight, s1, s2, stride, h);
+        else
+            return nsse16_neon(8, s1, s2, stride, h);
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index cf2b8da425..bd21122a21 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -847,3 +847,127 @@  function vsse_intra16_neon, export=1
 
         ret
 endfunc
+
+function nsse16_neon, export=1
+        // x0           multiplier
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        str             x0, [sp, #-0x40]!
+        stp             x1, x2, [sp, #0x10]
+        stp             x3, x4, [sp, #0x20]
+        str             x30, [sp, #0x30]
+        bl              X(sse16_neon)
+        ldr             x30, [sp, #0x30]
+        mov             w9, w0                                  // here we store score1
+        ldr             x5, [sp]
+        ldp             x1, x2, [sp, #0x10]
+        ldp             x3, x4, [sp, #0x20]
+        add             sp, sp, #0x40
+
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v19.8h, #0
+
+        ld1             {v0.16b}, [x1], x3
+        subs            w4, w4, #1                              // we need to make h-1 iterations
+        ext             v1.16b, v0.16b, v0.16b, #1              // x1 + 1
+        ld1             {v2.16b}, [x2], x3
+        cmp             w4, #2
+        ext             v3.16b, v2.16b, v2.16b, #1              // x2 + 1
+
+        b.lt            2f
+
+// make 2 iterations at once
+1:
+        ld1             {v4.16b}, [x1], x3
+        ld1             {v20.16b}, [x1], x3
+        ext             v5.16b, v4.16b, v4.16b, #1              // x1 + stride + 1
+        ext             v21.16b, v20.16b, v20.16b, #1
+        ld1             {v6.16b}, [x2], x3
+        ld1             {v22.16b}, [x2], x3
+        ext             v7.16b, v6.16b, v6.16b, #1              // x2 + stride + 1
+        ext             v23.16b, v22.16b, v22.16b, #1
+
+        usubl           v31.8h, v0.8b, v4.8b
+        usubl2          v30.8h, v0.16b, v4.16b
+        usubl           v29.8h, v1.8b, v5.8b
+        usubl2          v28.8h, v1.16b, v5.16b
+        saba            v16.8h, v31.8h, v29.8h
+        saba            v17.8h, v30.8h, v28.8h
+        usubl           v27.8h, v2.8b, v6.8b
+        usubl2          v26.8h, v2.16b, v6.16b
+        usubl           v25.8h, v3.8b, v7.8b
+        usubl2          v24.8h, v3.16b, v7.16b
+        saba            v18.8h, v27.8h, v25.8h
+        saba            v19.8h, v26.8h, v24.8h
+
+        usubl           v31.8h, v4.8b, v20.8b
+        usubl2          v30.8h, v4.16b, v20.16b
+        usubl           v29.8h, v5.8b, v21.8b
+        usubl2          v28.8h, v5.16b, v21.16b
+        saba            v16.8h, v31.8h, v29.8h
+        saba            v17.8h, v30.8h, v28.8h
+        usubl           v27.8h, v6.8b, v22.8b
+        usubl2          v26.8h, v6.16b, v22.16b
+        usubl           v25.8h, v7.8b, v23.8b
+        usubl2          v24.8h, v7.16b, v23.16b
+        saba            v18.8h, v27.8h, v25.8h
+        saba            v19.8h, v26.8h, v24.8h
+
+        mov             v0.16b, v20.16b
+        mov             v1.16b, v21.16b
+        mov             v2.16b, v22.16b
+        mov             v3.16b, v23.16b
+
+        sub             w4, w4, #2
+        cmp             w4, #2
+
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v4.16b}, [x1], x3
+        subs            w4, w4, #1
+        ext             v5.16b, v4.16b, v4.16b, #1              // x1 + stride + 1
+        ld1             {v6.16b}, [x2], x3
+        usubl           v31.8h, v0.8b, v4.8b
+        ext             v7.16b, v6.16b, v6.16b, #1              // x2 + stride + 1
+
+        usubl2          v30.8h, v0.16b, v4.16b
+        usubl           v29.8h, v1.8b, v5.8b
+        usubl2          v28.8h, v1.16b, v5.16b
+        saba            v16.8h, v31.8h, v29.8h
+        saba            v17.8h, v30.8h, v28.8h
+        usubl           v27.8h, v2.8b, v6.8b
+        usubl2          v26.8h, v2.16b, v6.16b
+        usubl           v25.8h, v3.8b, v7.8b
+        usubl2          v24.8h, v3.16b, v7.16b
+        saba            v18.8h, v27.8h, v25.8h
+        saba            v19.8h, v26.8h, v24.8h
+
+        mov             v0.16b, v4.16b
+        mov             v1.16b, v5.16b
+        mov             v2.16b, v6.16b
+        mov             v3.16b, v7.16b
+
+        cbnz            w4, 2b
+
+3:
+        sqsub           v16.8h, v16.8h, v18.8h
+        sqsub           v17.8h, v17.8h, v19.8h
+        ins             v17.h[7], wzr
+        sqadd           v16.8h, v16.8h, v17.8h
+        saddlv          s16, v16.8h
+        sqabs           s16, s16
+        fmov            w0, s16
+
+        mul             w0, w0, w5
+        add             w0, w0, w9
+
+        ret
+endfunc