
[FFmpeg-devel,5/5] lavc/aarch64: Provide neon implementation of nsse16

Message ID 20220822152627.1992008-6-hum@semihalf.com
State New
Series me_cmp: Provide arm64 neon implementations

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished

Commit Message

Hubert Mazur Aug. 22, 2022, 3:26 p.m. UTC
Add vectorized implementation of nsse16 function.

Performance comparison tests are shown below.
- nsse_0_c: 707.0
- nsse_0_neon: 120.0

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  15 +++
 libavcodec/aarch64/me_cmp_neon.S         | 126 +++++++++++++++++++++++
 2 files changed, 141 insertions(+)

Comments

Martin Storsjö Sept. 2, 2022, 9:29 p.m. UTC | #1
On Mon, 22 Aug 2022, Hubert Mazur wrote:

> Add vectorized implementation of nsse16 function.
>
> Performance comparison tests are shown below.
> - nsse_0_c: 707.0
> - nsse_0_neon: 120.0
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  15 +++
> libavcodec/aarch64/me_cmp_neon.S         | 126 +++++++++++++++++++++++
> 2 files changed, 141 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 46d4dade5d..9fe96e111c 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -889,3 +889,129 @@ function vsse_intra16_neon, export=1
>
>         ret
> endfunc
> +
> +function nsse16_neon, export=1
> +        // x0           multiplier
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        str             x0, [sp, #-0x40]!
> +        stp             x1, x2, [sp, #0x10]
> +        stp             x3, x4, [sp, #0x20]
> +        str             lr, [sp, #0x30]
> +        bl              sse16_neon
> +        ldr             lr, [sp, #0x30]

This breaks the build in two configurations: old binutils doesn't recognize 
the register name lr, so you need to spell out x30.

Building on macOS also breaks, since there's no symbol named sse16_neon; this 
is an exported function, so it gets the symbol prefix _. You need to do 
"bl X(sse16_neon)" here.

Didn't look at the code from a performance perspective yet.

// Martin
Martin Storsjö Sept. 4, 2022, 9:23 p.m. UTC | #2
On Mon, 22 Aug 2022, Hubert Mazur wrote:

> Add vectorized implementation of nsse16 function.
>
> Performance comparison tests are shown below.
> - nsse_0_c: 707.0
> - nsse_0_neon: 120.0
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  15 +++
> libavcodec/aarch64/me_cmp_neon.S         | 126 +++++++++++++++++++++++
> 2 files changed, 141 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 8c295d5457..146ef04345 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -49,6 +49,10 @@ int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
>                 ptrdiff_t stride, int h);
> int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
>                       ptrdiff_t stride, int h);
> +int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
> +                ptrdiff_t stride, int h);
> +int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> +                        ptrdiff_t stride, int h);
> 
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -72,5 +76,16 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>
>         c->vsse[0] = vsse16_neon;
>         c->vsse[4] = vsse_intra16_neon;
> +
> +        c->nsse[0] = nsse16_neon_wrapper;
>     }
> }
> +
> +int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> +                        ptrdiff_t stride, int h)
> +{
> +        if (c)
> +            return nsse16_neon(c->avctx->nsse_weight, s1, s2, stride, h);
> +        else
> +            return nsse16_neon(8, s1, s2, stride, h);
> +}
> \ No newline at end of file

The indentation is off for this file, and it's missing the final newline.

> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 46d4dade5d..9fe96e111c 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -889,3 +889,129 @@ function vsse_intra16_neon, export=1
>
>         ret
> endfunc
> +
> +function nsse16_neon, export=1
> +        // x0           multiplier
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        str             x0, [sp, #-0x40]!
> +        stp             x1, x2, [sp, #0x10]
> +        stp             x3, x4, [sp, #0x20]
> +        str             lr, [sp, #0x30]
> +        bl              sse16_neon
> +        ldr             lr, [sp, #0x30]
> +        mov             w9, w0                                  // here we store score1
> +        ldr             x5, [sp]
> +        ldp             x1, x2, [sp, #0x10]
> +        ldp             x3, x4, [sp, #0x20]
> +        add             sp, sp, #0x40
> +
> +        movi            v16.8h, #0
> +        movi            v17.8h, #0
> +        movi            v18.8h, #0
> +        movi            v19.8h, #0
> +
> +        mov             x10, x1                                 // x1
> +        mov             x14, x2                                 // x2

I don't see why you need to make a copy of x1/x2 here, as you don't use 
x1/x2 after this at all.
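
I.e. (untested sketch) the loop could just load via x1/x2 and advance them 
directly:

        ld1             {v0.16b}, [x1], x3              // instead of [x10], x3
        ld1             {v4.16b}, [x2], x3              // instead of [x14], x3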

> +        add             x11, x1, x3                             // x1 + stride
> +        add             x15, x2, x3                             // x2 + stride
> +        add             x12, x1, #1                             // x1 + 1
> +        add             x16, x2, #1                             // x2 + 1

FWIW, instead of doing two loads, for [x1] and [x1+1], I would normally just 
do one load of [x1] and make a shifted version with the 'ext' instruction, 
since we don't need the final value at [x1+16]; ext is generally cheaper 
than redundant loads. On the other hand, by doing two loads, you don't have 
a serial dependency on the first load.
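
Untested sketch of the ext variant; the byte shifted into the last lane is 
wrong (it would need pix1[16]), but that element is discarded at the end 
anyway (the ins v17.h[7], wzr below):

        ld1             {v0.16b}, [x1], x3              // pix1[0..15]
        ext             v2.16b, v0.16b, v0.16b, #1      // pix1[1..15], junk in lane 15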


> +// iterate by one
> +2:
> +        ld1             {v0.16b}, [x10], x3
> +        ld1             {v1.16b}, [x11], x3
> +        ld1             {v2.16b}, [x12], x3
> +        usubl           v31.8h, v0.8b, v1.8b
> +        ld1             {v3.16b}, [x13], x3
> +        usubl2          v30.8h, v0.16b, v1.16b
> +        usubl           v29.8h, v2.8b, v3.8b
> +        usubl2          v28.8h, v2.16b, v3.16b
> +        saba            v16.8h, v31.8h, v29.8h
> +        ld1             {v4.16b}, [x14], x3
> +        ld1             {v5.16b}, [x15], x3
> +        saba            v17.8h, v30.8h, v28.8h
> +        ld1             {v6.16b}, [x16], x3
> +        usubl           v27.8h, v4.8b, v5.8b
> +        ld1             {v7.16b}, [x17], x3

Looking at the main implementation structure here, going by the 
non-unrolled version: you're doing 8 loads per iteration, and I would say 
you can do this with 2 loads per iteration.

By reusing the loaded data from the previous iteration instead of loading 
it again, you can get this down from 8 to 4 loads. And by shifting with 
'ext' instead of a separate load, you can get it down to 2 loads. (Then 
again, with 4 loads instead of 2, you can have the overlapping loads 
running in parallel, instead of having to wait for the first load to 
complete when using ext. I'd suggest trying both and seeing which one 
works better - although the tradeoff might differ between cores. But 
keeping the data from the previous line instead of such duplicated loading 
is certainly better in any case.)
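
Roughly something like this per line (untested sketch; the usubl/saba 
arithmetic stays as in your patch, only the loads change):

        // v0 = previous pix1 row, v4 = previous pix2 row, loaded before the loop
1:
        ld1             {v1.16b}, [x1], x3              // next pix1 row
        ld1             {v5.16b}, [x2], x3              // next pix2 row
        ext             v2.16b, v0.16b, v0.16b, #1      // previous pix1 row + 1
        ext             v3.16b, v1.16b, v1.16b, #1      // next pix1 row + 1
        ext             v6.16b, v4.16b, v4.16b, #1      // previous pix2 row + 1
        ext             v7.16b, v5.16b, v5.16b, #1      // next pix2 row + 1
        // ... same usubl/saba accumulation as before on v0-v7 ...
        mov             v0.16b, v1.16b                  // next row becomes previous row
        mov             v4.16b, v5.16b
        subs            w4, w4, #1
        b.ne            1b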

> +        usubl2          v26.8h, v4.16b, v5.16b
> +        usubl           v25.8h, v6.8b, v7.8b
> +        usubl2          v24.8h, v6.16b, v7.16b
> +        saba            v18.8h, v27.8h, v25.8h
> +        subs            w4, w4, #1
> +        saba            v19.8h, v26.8h, v24.8h
> +
> +        cbnz            w4, 2b
> +
> +3:
> +        sqsub           v16.8h, v16.8h, v18.8h
> +        sqsub           v17.8h, v17.8h, v19.8h
> +        ins             v17.h[7], wzr

It's very good that you figured out how to handle the odd element outside 
of the loops here!

// Martin

Patch

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 8c295d5457..146ef04345 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -49,6 +49,10 @@  int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                 ptrdiff_t stride, int h);
 int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
                       ptrdiff_t stride, int h);
+int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
+                ptrdiff_t stride, int h);
+int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+                        ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -72,5 +76,16 @@  av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->vsse[0] = vsse16_neon;
         c->vsse[4] = vsse_intra16_neon;
+
+        c->nsse[0] = nsse16_neon_wrapper;
     }
 }
+
+int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+                        ptrdiff_t stride, int h)
+{
+        if (c)
+            return nsse16_neon(c->avctx->nsse_weight, s1, s2, stride, h);
+        else
+            return nsse16_neon(8, s1, s2, stride, h);
+}
\ No newline at end of file
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 46d4dade5d..9fe96e111c 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -889,3 +889,129 @@  function vsse_intra16_neon, export=1
 
         ret
 endfunc
+
+function nsse16_neon, export=1
+        // x0           multiplier
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        str             x0, [sp, #-0x40]!
+        stp             x1, x2, [sp, #0x10]
+        stp             x3, x4, [sp, #0x20]
+        str             lr, [sp, #0x30]
+        bl              sse16_neon
+        ldr             lr, [sp, #0x30]
+        mov             w9, w0                                  // here we store score1
+        ldr             x5, [sp]
+        ldp             x1, x2, [sp, #0x10]
+        ldp             x3, x4, [sp, #0x20]
+        add             sp, sp, #0x40
+
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v19.8h, #0
+
+        mov             x10, x1                                 // x1
+        mov             x14, x2                                 // x2
+        add             x11, x1, x3                             // x1 + stride
+        add             x15, x2, x3                             // x2 + stride
+        add             x12, x1, #1                             // x1 + 1
+        add             x16, x2, #1                             // x2 + 1
+        add             x13, x11, #1                            // x1 + stride + 1
+        add             x17, x15, #1                            // x2 + stride + 1
+
+        subs            w4, w4, #1                              // we need to make h-1 iterations
+        cmp             w4, #2
+        b.lt            2f
+
+// make 2 iterations at once
+1:
+        ld1             {v0.16b}, [x10], x3
+        ld1             {v1.16b}, [x11], x3
+        ld1             {v2.16b}, [x12], x3
+        usubl           v31.8h, v0.8b, v1.8b
+        ld1             {v3.16b}, [x13], x3
+        usubl2          v30.8h, v0.16b, v1.16b
+        usubl           v29.8h, v2.8b, v3.8b
+        usubl2          v28.8h, v2.16b, v3.16b
+        ld1             {v4.16b}, [x14], x3
+        saba            v16.8h, v31.8h, v29.8h
+        ld1             {v5.16b}, [x15], x3
+        ld1             {v6.16b}, [x16], x3
+        saba            v17.8h, v30.8h, v28.8h
+        ld1             {v7.16b}, [x17], x3
+        usubl           v27.8h, v4.8b, v5.8b
+        usubl2          v26.8h, v4.16b, v5.16b
+        usubl           v25.8h, v6.8b, v7.8b
+        ld1             {v31.16b}, [x10], x3
+        saba            v18.8h, v27.8h, v25.8h
+        usubl2          v24.8h, v6.16b, v7.16b
+        ld1             {v30.16b}, [x11], x3
+        ld1             {v29.16b}, [x12], x3
+        saba            v19.8h, v26.8h, v24.8h
+        usubl           v23.8h, v31.8b, v30.8b
+        ld1             {v28.16b}, [x13], x3
+        usubl2          v22.8h, v31.16b, v30.16b
+        usubl           v21.8h, v29.8b, v28.8b
+        ld1             {v27.16b}, [x14], x3
+        usubl2          v20.8h, v29.16b, v28.16b
+        saba            v16.8h, v23.8h, v21.8h
+        ld1             {v26.16b}, [x15], x3
+        ld1             {v25.16b}, [x16], x3
+        saba            v17.8h, v22.8h, v20.8h
+        ld1             {v24.16b}, [x17], x3
+        usubl           v31.8h, v27.8b, v26.8b
+        usubl           v29.8h, v25.8b, v24.8b
+        usubl2          v30.8h, v27.16b, v26.16b
+        saba            v18.8h, v31.8h, v29.8h
+        usubl2          v28.8h, v25.16b, v24.16b
+        sub             w4, w4, #2
+        cmp             w4, #2
+        saba            v19.8h, v30.8h, v28.8h
+
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v0.16b}, [x10], x3
+        ld1             {v1.16b}, [x11], x3
+        ld1             {v2.16b}, [x12], x3
+        usubl           v31.8h, v0.8b, v1.8b
+        ld1             {v3.16b}, [x13], x3
+        usubl2          v30.8h, v0.16b, v1.16b
+        usubl           v29.8h, v2.8b, v3.8b
+        usubl2          v28.8h, v2.16b, v3.16b
+        saba            v16.8h, v31.8h, v29.8h
+        ld1             {v4.16b}, [x14], x3
+        ld1             {v5.16b}, [x15], x3
+        saba            v17.8h, v30.8h, v28.8h
+        ld1             {v6.16b}, [x16], x3
+        usubl           v27.8h, v4.8b, v5.8b
+        ld1             {v7.16b}, [x17], x3
+        usubl2          v26.8h, v4.16b, v5.16b
+        usubl           v25.8h, v6.8b, v7.8b
+        usubl2          v24.8h, v6.16b, v7.16b
+        saba            v18.8h, v27.8h, v25.8h
+        subs            w4, w4, #1
+        saba            v19.8h, v26.8h, v24.8h
+
+        cbnz            w4, 2b
+
+3:
+        sqsub           v16.8h, v16.8h, v18.8h
+        sqsub           v17.8h, v17.8h, v19.8h
+        ins             v17.h[7], wzr
+        sqadd           v16.8h, v16.8h, v17.8h
+        saddlv          s16, v16.8h
+        sqabs           s16, s16
+        fmov            w0, s16
+
+        mul             w0, w0, w5
+        add             w0, w0, w9
+
+        ret
+endfunc