diff mbox series

[FFmpeg-devel,1/5] lavc/aarch64: Add neon implementation for vsad16

Message ID 20220822152627.1992008-2-hum@semihalf.com
State New
Headers show
Series me_cmp: Provide arm64 neon implementations | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Hubert Mazur Aug. 22, 2022, 3:26 p.m. UTC
Provide optimized implementation of vsad16 function for arm64.

Performance comparison tests are shown below.
- vsad_0_c: 285.0
- vsad_0_neon: 42.5

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  5 ++
 libavcodec/aarch64/me_cmp_neon.S         | 75 ++++++++++++++++++++++++
 2 files changed, 80 insertions(+)

Comments

Martin Storsjö Sept. 2, 2022, 9:49 p.m. UTC | #1
On Mon, 22 Aug 2022, Hubert Mazur wrote:

> Provide optimized implementation of vsad16 function for arm64.
>
> Performance comparison tests are shown below.
> - vsad_0_c: 285.0
> - vsad_0_neon: 42.5
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  5 ++
> libavcodec/aarch64/me_cmp_neon.S         | 75 ++++++++++++++++++++++++
> 2 files changed, 80 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 4198985c6c..d4c0099854 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -584,3 +584,78 @@ function sse4_neon, export=1
>
>         ret
> endfunc
> +
> +function vsad16_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        sub             w4, w4, #1                      // we need to make h-1 iterations
> +        movi            v16.8h, #0
> +
> +        cmp             w4, #3                          // check if we can make 3 iterations at once
> +        add             x5, x1, x3                      // pix1 + stride
> +        add             x6, x2, x3                      // pix2 + stride
> +        b.le            2f
> +
> +1:
> +        // abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
> +        // abs(x) = (x < 0 ? (-x) : (x))
> +        ld1             {v0.16b}, [x1], x3              // Load pix1[0], first iteration
> +        ld1             {v1.16b}, [x2], x3              // Load pix2[0], first iteration
> +        ld1             {v2.16b}, [x5], x3              // Load pix1[0 + stride], first iteration
> +        usubl           v31.8h, v0.8b, v1.8b            // Signed difference pix1[0] - pix2[0], first iteration
> +        ld1             {v3.16b}, [x6], x3              // Load pix2[0 + stride], first iteration
> +        usubl2          v30.8h, v0.16b, v1.16b          // Signed difference pix1[0] - pix2[0], first iteration
> +        usubl           v29.8h, v2.8b, v3.8b            // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
> +        ld1             {v4.16b}, [x1], x3              // Load pix1[0], second iteration
> +        usubl2          v28.8h, v2.16b, v3.16b          // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
> +        ld1             {v5.16b}, [x2], x3              // Load pix2[0], second iteration
> +        saba            v16.8h, v31.8h, v29.8h          // Signed absolute difference and accumulate the result. first iteration
> +        ld1             {v6.16b}, [x5], x3              // Load pix1[0 + stride], second iteration
> +        saba            v16.8h, v30.8h, v28.8h          // Signed absolute difference and accumulate the result. first iteration
> +        usubl           v27.8h, v4.8b, v5.8b            // Signed difference pix1[0] - pix2[0], second iteration
> +        ld1             {v7.16b}, [x6], x3              // Load pix2[0 + stride], second iteration
> +        usubl2          v26.8h, v4.16b, v5.16b          // Signed difference pix1[0] - pix2[0], second iteration
> +        usubl           v25.8h, v6.8b, v7.8b            // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
> +        ld1             {v17.16b}, [x1], x3             // Load pix1[0], third iteration
> +        usubl2          v24.8h, v6.16b, v7.16b          // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
> +        ld1             {v18.16b}, [x2], x3             // Load pix2[0], third iteration
> +        saba            v16.8h, v27.8h, v25.8h          // Signed absolute difference and accumulate the result. second iteration
> +        ld1             {v19.16b}, [x5], x3             // Load pix1[0 + stride], third iteration
> +        saba            v16.8h, v26.8h, v24.8h          // Signed absolute difference and accumulate the result. second iteration
> +        usubl           v23.8h, v17.8b, v18.8b          // Signed difference pix1[0] - pix2[0], third iteration
> +        ld1             {v20.16b}, [x6], x3             // Load pix2[0 + stride], third iteration
> +        usubl2          v22.8h, v17.16b, v18.16b        // Signed difference pix1[0] - pix2[0], third iteration
> +        usubl           v21.8h, v19.8b, v20.8b          // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
> +        sub             w4, w4, #3                      // h -= 3
> +        saba            v16.8h, v23.8h, v21.8h          // Signed absolute difference and accumulate the result. third iteration
> +        usubl2          v31.8h, v19.16b, v20.16b        // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
> +        cmp             w4, #3
> +        saba            v16.8h, v22.8h, v31.8h          // Signed absolute difference and accumulate the result. third iteration

This unrolled implementation isn't really interleaved much at all. In such 
a case there's not really much benefit from an unrolled implementation, 
other than saving a couple decrements/branch instructions.

I tested playing with your code here. Originally, checkasm gave me these 
benchmark numbers:

          Cortex A53    A72    A73
vsad_0_neon:  162.7   74.5   67.7

When I switched to only running the non-unrolled version below, I got 
these numbers:

vsad_0_neon:  165.2   78.5   76.0

I.e. hardly any difference at all. In such a case, we're just wasting a 
lot of code size (and code maintainability!) on big unrolled code without 
much benefit at all. But we can do better.

> +
> +        b.ge            1b
> +        cbz             w4, 3f
> +2:
> +
> +        ld1             {v0.16b}, [x1], x3
> +        ld1             {v1.16b}, [x2], x3
> +        ld1             {v2.16b}, [x5], x3
> +        usubl           v30.8h, v0.8b, v1.8b
> +        ld1             {v3.16b}, [x6], x3
> +        usubl2          v29.8h, v0.16b, v1.16b
> +        usubl           v28.8h, v2.8b, v3.8b
> +        usubl2          v27.8h, v2.16b, v3.16b
> +        saba            v16.8h, v30.8h, v28.8h
> +        subs            w4, w4, #1
> +        saba            v16.8h, v29.8h, v27.8h

Now let's look at this simple non-unrolled implementation first. We're 
doing a lot of redundant work here (and in the other implementation).

When we've loaded v2/v3 in the first round in this loop, and calculated 
v28/v27 from them, and we then proceed to the next iteration, v0/v1 will 
be identical to v2/v3, and v30/v29 will be identical to v28/v27 from the 
previous iteration.

So if we just load one row from pix1/pix2 each and calculate their 
difference in the start of the function, and then just load one row from 
each, subtract them and accumulate the difference to the previous one, 
we're done.

Originally with your non-unrolled implementation, I got these benchmark 
numbers:

vsad_0_neon:  165.2   78.5   76.0

After this simplification, I got these numbers:

vsad_0_neon:  131.2   68.5   70.0

In this case, my code looked like this:

         ld1             {v0.16b}, [x1], x3
         ld1             {v1.16b}, [x2], x3
         usubl           v31.8h, v0.8b,  v1.8b
         usubl2          v30.8h, v0.16b, v1.16b

2:
         ld1             {v0.16b}, [x1], x3
         ld1             {v1.16b}, [x2], x3
         subs            w4, w4, #1
         usubl           v29.8h,  v0.8b,   v1.8b
         usubl2          v28.8h,  v0.16b,  v1.16b
         saba            v16.8h,  v31.8h,  v29.8h
         mov             v31.16b, v29.16b
         saba            v16.8h,  v30.8h,  v28.8h
         mov             v30.16b, v28.16b
         b.ne            2b

Isn't that much simpler? Now after that, I tried doing the same 
modification to the unrolled version. FWIW, I did it like this: For an 
algorithm this simple, where there's more than enough registers, I wrote 
the unrolled implementation like this:


     ld1 // first iter
     ld1 // first iter
     ld1 // second iter
     ld1 // second iter
     ld1 // third iter
     ld1 // third iter
     usubl  // first
     usubl2 // first
     usubl ..
     usubl2 ..
     ...
     saba // first
     saba // first
     ...

After that, I reordered them so that we start with the first usubl's after 
a couple of instructions after the corresponding loads, then moved the 
following saba's closer.

With that, I'm getting these checkasm numbers:

          Cortex A53    A72    A73
vsad_0_neon:  108.7   61.2   58.0

As context, this patch originally gave these numbers:

vsad_0_neon:  162.7   74.5   67.7

I.e. a 1.15x speedup on A73, 1.21x speedup on A72 and an 1.5x speedup on 
A53.

You can see my version in the attached patch; apply it on top of yours.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index fb7c3f5059..ddc5d05611 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -41,6 +41,9 @@  int sse8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
 int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
               ptrdiff_t stride, int h);
 
+int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+                ptrdiff_t stride, int h);
+
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -57,5 +60,7 @@  av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->sse[0] = sse16_neon;
         c->sse[1] = sse8_neon;
         c->sse[2] = sse4_neon;
+
+        c->vsad[0] = vsad16_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 4198985c6c..d4c0099854 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -584,3 +584,78 @@  function sse4_neon, export=1
 
         ret
 endfunc
+
+function vsad16_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        sub             w4, w4, #1                      // we need to make h-1 iterations
+        movi            v16.8h, #0
+
+        cmp             w4, #3                          // check if we can make 3 iterations at once
+        add             x5, x1, x3                      // pix1 + stride
+        add             x6, x2, x3                      // pix2 + stride
+        b.le            2f
+
+1:
+        // abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
+        // abs(x) = (x < 0 ? (-x) : (x))
+        ld1             {v0.16b}, [x1], x3              // Load pix1[0], first iteration
+        ld1             {v1.16b}, [x2], x3              // Load pix2[0], first iteration
+        ld1             {v2.16b}, [x5], x3              // Load pix1[0 + stride], first iteration
+        usubl           v31.8h, v0.8b, v1.8b            // Signed difference pix1[0] - pix2[0], first iteration
+        ld1             {v3.16b}, [x6], x3              // Load pix2[0 + stride], first iteration
+        usubl2          v30.8h, v0.16b, v1.16b          // Signed difference pix1[0] - pix2[0], first iteration
+        usubl           v29.8h, v2.8b, v3.8b            // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
+        ld1             {v4.16b}, [x1], x3              // Load pix1[0], second iteration
+        usubl2          v28.8h, v2.16b, v3.16b          // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
+        ld1             {v5.16b}, [x2], x3              // Load pix2[0], second iteration
+        saba            v16.8h, v31.8h, v29.8h          // Signed absolute difference and accumulate the result. first iteration
+        ld1             {v6.16b}, [x5], x3              // Load pix1[0 + stride], second iteration
+        saba            v16.8h, v30.8h, v28.8h          // Signed absolute difference and accumulate the result. first iteration
+        usubl           v27.8h, v4.8b, v5.8b            // Signed difference pix1[0] - pix2[0], second iteration
+        ld1             {v7.16b}, [x6], x3              // Load pix2[0 + stride], second iteration
+        usubl2          v26.8h, v4.16b, v5.16b          // Signed difference pix1[0] - pix2[0], second iteration
+        usubl           v25.8h, v6.8b, v7.8b            // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
+        ld1             {v17.16b}, [x1], x3             // Load pix1[0], third iteration
+        usubl2          v24.8h, v6.16b, v7.16b          // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
> +        ld1             {v18.16b}, [x2], x3             // Load pix2[0], third iteration
+        saba            v16.8h, v27.8h, v25.8h          // Signed absolute difference and accumulate the result. second iteration
+        ld1             {v19.16b}, [x5], x3             // Load pix1[0 + stride], third iteration
+        saba            v16.8h, v26.8h, v24.8h          // Signed absolute difference and accumulate the result. second iteration
+        usubl           v23.8h, v17.8b, v18.8b          // Signed difference pix1[0] - pix2[0], third iteration
+        ld1             {v20.16b}, [x6], x3             // Load pix2[0 + stride], third iteration
+        usubl2          v22.8h, v17.16b, v18.16b        // Signed difference pix1[0] - pix2[0], third iteration
+        usubl           v21.8h, v19.8b, v20.8b          // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
+        sub             w4, w4, #3                      // h -= 3
+        saba            v16.8h, v23.8h, v21.8h          // Signed absolute difference and accumulate the result. third iteration
+        usubl2          v31.8h, v19.16b, v20.16b        // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
+        cmp             w4, #3
+        saba            v16.8h, v22.8h, v31.8h          // Signed absolute difference and accumulate the result. third iteration
+
+        b.ge            1b
+        cbz             w4, 3f
+2:
+
+        ld1             {v0.16b}, [x1], x3
+        ld1             {v1.16b}, [x2], x3
+        ld1             {v2.16b}, [x5], x3
+        usubl           v30.8h, v0.8b, v1.8b
+        ld1             {v3.16b}, [x6], x3
+        usubl2          v29.8h, v0.16b, v1.16b
+        usubl           v28.8h, v2.8b, v3.8b
+        usubl2          v27.8h, v2.16b, v3.16b
+        saba            v16.8h, v30.8h, v28.8h
+        subs            w4, w4, #1
+        saba            v16.8h, v29.8h, v27.8h
+
+        b.ne            2b
+3:
+        uaddlv          s17, v16.8h
+        fmov            w0, s17
+
+        ret
+endfunc