diff mbox series

[FFmpeg-devel,2/7] avcodec/aarch64/mpegvideoencdsp: add neon implementations for pix_sum and pix_norm1

Message ID 20240818201326.100492-2-ramiro.polla@gmail.com
State New
Headers show
Series [FFmpeg-devel,1/7] checkasm/mpegvideoencdsp: add pix_sum and pix_norm1 | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Ramiro Polla Aug. 18, 2024, 8:13 p.m. UTC
A53             A76
pix_norm1_c:     519.2           231.5
pix_norm1_neon:  195.0 ( 2.66x)   44.2 ( 5.24x)
pix_sum_c:       344.5           242.2
pix_sum_neon:    119.0 ( 2.89x)   41.7 ( 5.81x)
---
 libavcodec/aarch64/Makefile               |  2 +
 libavcodec/aarch64/mpegvideoencdsp_init.c | 39 +++++++++++++
 libavcodec/aarch64/mpegvideoencdsp_neon.S | 67 +++++++++++++++++++++++
 libavcodec/mpegvideoencdsp.c              |  4 +-
 libavcodec/mpegvideoencdsp.h              |  2 +
 5 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/mpegvideoencdsp_init.c
 create mode 100644 libavcodec/aarch64/mpegvideoencdsp_neon.S

Comments

Ramiro Polla Aug. 18, 2024, 8:20 p.m. UTC | #1
On Sun, Aug 18, 2024 at 10:13 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
>                    A53             A76
> pix_norm1_c:     519.2           231.5
> pix_norm1_neon:  195.0 ( 2.66x)   44.2 ( 5.24x)
> pix_sum_c:       344.5           242.2
> pix_sum_neon:    119.0 ( 2.89x)   41.7 ( 5.81x)

This new patchset no longer uses unrolled loops. Even though checkasm
reported the unrolled versions to be faster, in a real encoding
use-case linux perf reports that the non-unrolled versions are faster.
Martin Storsjö Aug. 18, 2024, 8:43 p.m. UTC | #2
On Sun, 18 Aug 2024, Ramiro Polla wrote:

>                   A53             A76
> pix_norm1_c:     519.2           231.5
> pix_norm1_neon:  195.0 ( 2.66x)   44.2 ( 5.24x)
> pix_sum_c:       344.5           242.2
> pix_sum_neon:    119.0 ( 2.89x)   41.7 ( 5.81x)
> ---

Hmm, those speedups on the A53 look quite small. I guess that's because 
this isn't unrolled at all, as you mention. Especially for A53, I would 
expect unrolling to have a very large effect here. But it sounds weird if 
you say perf indicates that it is slower in real world use. Yes, unrolling 
does make the code use more space and makes the I-cache less efficient, 
but in this case it would only be a difference of like 2 instructions?


> diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> new file mode 100644
> index 0000000000..89e50e29b3
> --- /dev/null
> +++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> @@ -0,0 +1,67 @@
> +/*
> + * Copyright (c) 2024 Ramiro Polla
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +function ff_pix_sum16_neon, export=1
> +// x0  const uint8_t *pix
> +// x1  int line_size
> +
> +        sxtw            x1, w1
> +        movi            v0.16b, #0
> +        mov             w2, #16
> +
> +1:
> +        ld1             { v1.16b }, [x0], x1

Nit; we usually don't have these {} written with spaces inside of the 
braces, same below.

> +        subs            w2, w2, #1
> +        uadalp          v0.8h, v1.16b
> +        b.ne            1b
> +
> +        uaddlp          v0.4s, v0.8h
> +        uaddlv          d0, v0.4s

Couldn't this be aggregated with just one instruction, "uaddlv s0, v0.8h"? 
There's no need to widen it to 64 bit as we're truncating the returned 
value to 32 bit anyway.

// Martin
Ramiro Polla Aug. 18, 2024, 9:58 p.m. UTC | #3
On Sun, Aug 18, 2024 at 10:43 PM Martin Storsjö <martin@martin.st> wrote:
> On Sun, 18 Aug 2024, Ramiro Polla wrote:
>
> >                   A53             A76
> > pix_norm1_c:     519.2           231.5
> > pix_norm1_neon:  195.0 ( 2.66x)   44.2 ( 5.24x)
> > pix_sum_c:       344.5           242.2
> > pix_sum_neon:    119.0 ( 2.89x)   41.7 ( 5.81x)
> > ---
>
> Hmm, those speedups on the A53 look quite small. I guess that's because
> this isn't unrolled at all, as you mention. Especially for A53, I would
> expect unrolling to have a very large effect here. But it sounds weird if
> you say perf indicates that it is slower in real world use. Yes, unrolling
> does make the code use more space and makes the I-cache less efficient,
> but in this case it would only be a difference of like 2 instructions?

These are the checkasm benchmarks I got for the unrolled version with
manual instruction ordering to give better results on the A53 (patch
attached for reference):
                      A53             A76
pix_norm1_c:        519.0           231.7
pix_norm1_neon:     140.0 ( 3.71x)   41.5 ( 5.58x)
pix_norm1_dotprod:                   17.2 (13.47x)
pix_sum_c:          347.2           242.0
pix_sum_neon:        72.0 ( 4.82x)   21.0 (11.52x)

I had tested the real world case on the A76, but not on the A53. I
spent a couple of hours with perf trying to find the source of the
discrepancy but I couldn't find anything conclusive. I need to learn
more about how to test cache misses.

I just tested again with the following command:
$ taskset -c 2 ./ffmpeg_g -benchmark -f lavfi -i
"testsrc2=size=1920x1080" -vcodec mpeg4 -q 31 -vframes 100 -f rawvideo
-y /dev/null

The entire test was about 1% faster unrolled on A53, but about 1%
slower unrolled on A76 (I had the Raspberry Pi 5 in mind for these
optimizations, so I preferred choosing the version that was faster on
the A76). I wonder if there is any way we could check at runtime. The
problem is also that I don't even know for certain what is causing
this.

> > diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> > new file mode 100644
> > index 0000000000..89e50e29b3
> > --- /dev/null
> > +++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> > @@ -0,0 +1,67 @@
> > +/*
> > + * Copyright (c) 2024 Ramiro Polla
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/aarch64/asm.S"
> > +
> > +function ff_pix_sum16_neon, export=1
> > +// x0  const uint8_t *pix
> > +// x1  int line_size
> > +
> > +        sxtw            x1, w1
> > +        movi            v0.16b, #0
> > +        mov             w2, #16
> > +
> > +1:
> > +        ld1             { v1.16b }, [x0], x1
>
> Nit; we usually don't have these {} written with spaces inside of the
> braces, same below.

Oops, I should check my other neon code then...

> > +        subs            w2, w2, #1
> > +        uadalp          v0.8h, v1.16b
> > +        b.ne            1b
> > +
> > +        uaddlp          v0.4s, v0.8h
> > +        uaddlv          d0, v0.4s
>
> Couldn't this be aggregated with just one instruction, "uaddlv s0, v0.8h"?
> There's no need to widen it to 64 bit as we're truncating the returned
> value to 32 bit anyway.

Yes, that works. I'll fix it in the next iteration.
Rémi Denis-Courmont Aug. 19, 2024, 5:24 a.m. UTC | #4
Le 18 août 2024 23:13:21 GMT+03:00, Ramiro Polla <ramiro.polla@gmail.com> a écrit :
>                   A53             A76
>pix_norm1_c:     519.2           231.5
>pix_norm1_neon:  195.0 ( 2.66x)   44.2 ( 5.24x)
>pix_sum_c:       344.5           242.2
>pix_sum_neon:    119.0 ( 2.89x)   41.7 ( 5.81x)
>---
> libavcodec/aarch64/Makefile               |  2 +
> libavcodec/aarch64/mpegvideoencdsp_init.c | 39 +++++++++++++
> libavcodec/aarch64/mpegvideoencdsp_neon.S | 67 +++++++++++++++++++++++
> libavcodec/mpegvideoencdsp.c              |  4 +-
> libavcodec/mpegvideoencdsp.h              |  2 +
> 5 files changed, 113 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/mpegvideoencdsp_init.c
> create mode 100644 libavcodec/aarch64/mpegvideoencdsp_neon.S
>
>diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
>index a3256bb1cc..de0653ebbc 100644
>--- a/libavcodec/aarch64/Makefile
>+++ b/libavcodec/aarch64/Makefile
>@@ -10,6 +10,7 @@ OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
> OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
> OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
> OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
>+OBJS-$(CONFIG_MPEGVIDEOENC)             += aarch64/mpegvideoencdsp_init.o
> OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
> OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
> OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
>@@ -51,6 +52,7 @@ NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_neon.o              \
>                                            aarch64/simple_idct_neon.o
> NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
> NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
>+NEON-OBJS-$(CONFIG_MPEGVIDEOENC)        += aarch64/mpegvideoencdsp_neon.o
> NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
> NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
> NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
>diff --git a/libavcodec/aarch64/mpegvideoencdsp_init.c b/libavcodec/aarch64/mpegvideoencdsp_init.c
>new file mode 100644
>index 0000000000..7eb632ed1b
>--- /dev/null
>+++ b/libavcodec/aarch64/mpegvideoencdsp_init.c
>@@ -0,0 +1,39 @@
>+/*
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include <stddef.h>
>+#include <stdint.h>
>+
>+#include "libavutil/attributes.h"
>+#include "libavutil/aarch64/cpu.h"
>+#include "libavcodec/mpegvideoencdsp.h"
>+#include "config.h"
>+
>+int ff_pix_sum16_neon(const uint8_t *pix, int line_size);
>+int ff_pix_norm1_neon(const uint8_t *pix, int line_size);
>+
>+av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
>+                                             AVCodecContext *avctx)
>+{
>+    int cpu_flags = av_get_cpu_flags();
>+
>+    if (have_neon(cpu_flags)) {
>+        c->pix_sum   = ff_pix_sum16_neon;
>+        c->pix_norm1 = ff_pix_norm1_neon;
>+    }
>+}
>diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S b/libavcodec/aarch64/mpegvideoencdsp_neon.S
>new file mode 100644
>index 0000000000..89e50e29b3
>--- /dev/null
>+++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
>@@ -0,0 +1,67 @@
>+/*
>+ * Copyright (c) 2024 Ramiro Polla
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include "libavutil/aarch64/asm.S"
>+
>+function ff_pix_sum16_neon, export=1
>+// x0  const uint8_t *pix
>+// x1  int line_size
>+
>+        sxtw            x1, w1

Can we fix the scan line size to ptrdiff_t to avoid this? Or is it too difficult to deal with existing optimisations?

>+        movi            v0.16b, #0
>+        mov             w2, #16
>+
>+1:
>+        ld1             { v1.16b }, [x0], x1
>+        subs            w2, w2, #1
>+        uadalp          v0.8h, v1.16b
>+        b.ne            1b
>+
>+        uaddlp          v0.4s, v0.8h
>+        uaddlv          d0, v0.4s
>+        fmov            w0, s0
>+
>+        ret
>+endfunc
>+
>+function ff_pix_norm1_neon, export=1
>+// x0  const uint8_t *pix
>+// x1  int line_size
>+
>+        sxtw            x1, w1
>+        movi            v4.16b, #0
>+        movi            v5.16b, #0
>+        mov             w2, #16
>+
>+1:
>+        ld1             { v1.16b }, [x0], x1
>+        subs            w2, w2, #1
>+        umull           v2.8h, v1.8b,  v1.8b
>+        umull2          v3.8h, v1.16b, v1.16b
>+        uadalp          v4.4s, v2.8h
>+        uadalp          v5.4s, v3.8h
>+        b.ne            1b
>+
>+        add             v0.4s, v4.4s, v5.4s
>+        uaddlv          d0, v0.4s
>+        fmov            w0, s0
>+
>+        ret
>+endfunc
>diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
>index 997d048663..a96f0b6436 100644
>--- a/libavcodec/mpegvideoencdsp.c
>+++ b/libavcodec/mpegvideoencdsp.c
>@@ -245,7 +245,9 @@ av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
> 
>     c->draw_edges = draw_edges_8_c;
> 
>-#if ARCH_ARM
>+#if ARCH_AARCH64
>+    ff_mpegvideoencdsp_init_aarch64(c, avctx);
>+#elif ARCH_ARM
>     ff_mpegvideoencdsp_init_arm(c, avctx);
> #elif ARCH_PPC
>     ff_mpegvideoencdsp_init_ppc(c, avctx);
>diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
>index 95084679d9..63dbd39603 100644
>--- a/libavcodec/mpegvideoencdsp.h
>+++ b/libavcodec/mpegvideoencdsp.h
>@@ -46,6 +46,8 @@ typedef struct MpegvideoEncDSPContext {
> 
> void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
>                              AVCodecContext *avctx);
>+void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
>+                                     AVCodecContext *avctx);
> void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
>                                  AVCodecContext *avctx);
> void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
Martin Storsjö Aug. 19, 2024, 9:26 a.m. UTC | #5
On Sun, 18 Aug 2024, Ramiro Polla wrote:

> I had tested the real world case on the A76, but not on the A53. I
> spent a couple of hours with perf trying to find the source of the
> discrepancy but I couldn't find anything conclusive. I need to learn
> more about how to test cache misses.

Nah, I guess that's a bit overkill...

> I just tested again with the following command:
> $ taskset -c 2 ./ffmpeg_g -benchmark -f lavfi -i
> "testsrc2=size=1920x1080" -vcodec mpeg4 -q 31 -vframes 100 -f rawvideo
> -y /dev/null
>
> The entire test was about 1% faster unrolled on A53, but about 1%
> slower unrolled on A76 (I had the Raspberry Pi 5 in mind for these
> optimizations, so I preferred choosing the version that was faster on
> the A76).

> I wonder if there is any way we could check at runtime.

There are indeed often cases where functions could be tuned differently 
for older/newer or in-order/out-of-order cores. In most cases, trying to 
specialize things is a bit waste and overkill though - in most cases, I'd 
just suggest going with a compromise.

(Sometimes, different kinds of tunings can be applied if you use e.g. the 
flag dotprod to differentiate between older and newer cores. But it's 
seldom worth the extra effort to do that.)


Right, so looking at your unrolled case, you've done a full unroll. That's 
probably also a bit overkill.

The in-order cores really hate tight loops where almost everything has a 
sequential dependency on the previous instruction - so the general rule of 
thumb is that you'll want to unroll by a factor of 2, unless the algorithm 
itself has enough complexity that there's two separate dependency chains 
interlinked.

Also, from your unrolled version, there's a slight bug in it:

> +        add             x2, x0, w1, sxtw
> +        lsl             w1, w1, #1

If the stride is a negative number, the first sxtw does the right thing, 
but the "lsl w1, w1, #1" will zero out the upper half of the register.

So for that, you'd still need to keep the "sxtw x1, w1" instruction, and 
do the lsl on x1 instead. It is actually possible to merge it into one 
instruction though, with "sbfiz x1, x1, #1, #32", if I read the docs 
right. But that's a much more uncommon instruction...

As for optimal performance here - I tried something like this:

         movi            v0.16b, #0
         add             x2, x0, w1, sxtw
         sbfiz           x1, x1, #1, #32
         mov             w3, #16

1:
         ld1             {v1.16b}, [x0], x1
         ld1             {v2.16b}, [x2], x1
         subs            w3, w3, #2
         uadalp          v0.8h, v1.16b
         uadalp          v0.8h, v2.16b
         b.ne            1b

         uaddlv          s0, v0.8h
         fmov            w0, s0

         ret

With this, I'm down from your 120 cycles on the A53 originally, to 78.7 
cycles now. Your fully unrolled version seemed to run in 72 cycles on the 
A53, so that's obviously even faster, but I think this kind of tradeoff 
might be the sweet spot. What does such a version give you in terms of 
real world speed?

On this version, you can also note that the two sequential uadalp may seem 
a little potentially problematic. I did try using two separate accumulator 
registers, accumulating into v0 and v1 separately here, and only summing 
them at the end. That didn't make any difference, so the A53 may 
potentially have a special case where two such sequential accumulations 
into the same register doesn't incur the extra full latency. (The A53 does 
have such a case for "mla" at least.)

// Martin
Ramiro Polla Aug. 19, 2024, noon UTC | #6
On Mon, Aug 19, 2024 at 11:46 AM Martin Storsjö <martin@martin.st> wrote:
> On Sun, 18 Aug 2024, Ramiro Polla wrote:
> > I had tested the real world case on the A76, but not on the A53. I
> > spent a couple of hours with perf trying to find the source of the
> > discrepancy but I couldn't find anything conclusive. I need to learn
> > more about how to test cache misses.
>
> Nah, I guess that's a bit overkill...
>
> > I just tested again with the following command:
> > $ taskset -c 2 ./ffmpeg_g -benchmark -f lavfi -i
> > "testsrc2=size=1920x1080" -vcodec mpeg4 -q 31 -vframes 100 -f rawvideo
> > -y /dev/null
> >
> > The entire test was about 1% faster unrolled on A53, but about 1%
> > slower unrolled on A76 (I had the Raspberry Pi 5 in mind for these
> > optimizations, so I preferred choosing the version that was faster on
> > the A76).
>
> > I wonder if there is any way we could check at runtime.
>
> There are indeed often cases where functions could be tuned differently
> for older/newer or in-order/out-of-order cores. In most cases, trying to
> specialize things is a bit waste and overkill though - in most cases, I'd
> just suggest going with a compromise.
>
> (Sometimes, different kinds of tunings can be applied if you use e.g. the
> flag dotprod to differentiate between older and newer cores. But it's
> seldom worth the extra effort to do that.)
>
>
> Right, so looking at your unrolled case, you've done a full unroll. That's
> probably also a bit overkill.
>
> The in-order cores really hate tight loops where almost everything has a
> sequential dependency on the previous instruction - so the general rule of
> thumb is that you'll want to unroll by a factor of 2, unless the algorithm
> itself has enough complexity that there's two separate dependency chains
> interlinked.
>
> Also, from your unrolled version, there's a slight bug in it:
>
> > +        add             x2, x0, w1, sxtw
> > +        lsl             w1, w1, #1
>
> If the stride is a negative number, the first sxtw does the right thing,
> but the "lsl w1, w1, #1" will zero out the upper half of the register.

I'll start adding negative stride tests to checkasm to spot these bugs.

> So for that, you'd still need to keep the "sxtw x1, w1" instruction, and
> do the lsl on x1 instead. It is actually possible to merge it into one
> instruction though, with "sbfiz x1, x1, #1, #32", if I read the docs
> right. But that's a much more uncommon instruction...
>
> As for optimal performance here - I tried something like this:
>
>          movi            v0.16b, #0
>          add             x2, x0, w1, sxtw
>          sbfiz           x1, x1, #1, #32
>          mov             w3, #16
>
> 1:
>          ld1             {v1.16b}, [x0], x1
>          ld1             {v2.16b}, [x2], x1
>          subs            w3, w3, #2
>          uadalp          v0.8h, v1.16b
>          uadalp          v0.8h, v2.16b
>          b.ne            1b
>
>          uaddlv          s0, v0.8h
>          fmov            w0, s0
>
>          ret
>
> With this, I'm down from your 120 cycles on the A53 originally, to 78.7
> cycles now. Your fully unrolled version seemed to run in 72 cycles on the
> A53, so that's obviously even faster, but I think this kind of tradeoff
> might be the sweet spot. What does such a version give you in terms of
> real world speed?

This version is around 0.5% slower overall on the A76. Very roughly
these are the total times taken by pix_sum and pix_norm1 with the
different implementations on A76:
c: ~5%
fully unrolled: ~3%
unroll 2: 2.5%
tight loop: 2%
Martin Storsjö Aug. 19, 2024, 12:08 p.m. UTC | #7
On Mon, 19 Aug 2024, Ramiro Polla wrote:

>> If the stride is a negative number, the first sxtw does the right thing,
>> but the "lsl w1, w1, #1" will zero out the upper half of the register.
>
> I'll start adding negative stride tests to checkasm to spot these bugs.

That's probably useful. The other alternative is to transition these cases 
to use ptrdiff_t for the stride, which should be register sized, so most 
of the sign extension issues around strides go away. (We've transitioned 
lots of preexisting DSP interfaces already, so doing that here would just 
be the next logical step. But at times, this may require marginal 
touch-ups to existing assembly, or at least allows getting rid of such 
sign extensions later.)

>> With this, I'm down from your 120 cycles on the A53 originally, to 78.7
>> cycles now. Your fully unrolled version seemed to run in 72 cycles on the
>> A53, so that's obviously even faster, but I think this kind of tradeoff
>> might be the sweet spot. What does such a version give you in terms of
>> real world speed?
>
> This version is around 0.5% slower overall on the A76. Very roughly
> these are the total times taken by pix_sum and pix_norm1 with the
> different implementations on A76:
> c: ~5%
> fully unrolled: ~3%
> unroll 2: 2.5%
> tight loop: 2%

Ok. Given the tradeoff between various different cores (including ones not 
tested here), do you think this version would be a reasonable compromise 
(giving almost ideal results on in-order cores, and not too much slowdown 
on out-of-order cores in this benchmark)?

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index a3256bb1cc..de0653ebbc 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -10,6 +10,7 @@  OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
 OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
 OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
+OBJS-$(CONFIG_MPEGVIDEOENC)             += aarch64/mpegvideoencdsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
 OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
@@ -51,6 +52,7 @@  NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_neon.o              \
                                            aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
+NEON-OBJS-$(CONFIG_MPEGVIDEOENC)        += aarch64/mpegvideoencdsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
 NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/mpegvideoencdsp_init.c b/libavcodec/aarch64/mpegvideoencdsp_init.c
new file mode 100644
index 0000000000..7eb632ed1b
--- /dev/null
+++ b/libavcodec/aarch64/mpegvideoencdsp_init.c
@@ -0,0 +1,39 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideoencdsp.h"
+#include "config.h"
+
+int ff_pix_sum16_neon(const uint8_t *pix, int line_size);
+int ff_pix_norm1_neon(const uint8_t *pix, int line_size);
+
+av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
+                                             AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->pix_sum   = ff_pix_sum16_neon;
+        c->pix_norm1 = ff_pix_norm1_neon;
+    }
+}
diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S b/libavcodec/aarch64/mpegvideoencdsp_neon.S
new file mode 100644
index 0000000000..89e50e29b3
--- /dev/null
+++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
@@ -0,0 +1,67 @@ 
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_pix_sum16_neon, export=1
+// x0  const uint8_t *pix
+// x1  int line_size
+
+        sxtw            x1, w1
+        movi            v0.16b, #0
+        mov             w2, #16
+
+1:
+        ld1             { v1.16b }, [x0], x1
+        subs            w2, w2, #1
+        uadalp          v0.8h, v1.16b
+        b.ne            1b
+
+        uaddlp          v0.4s, v0.8h
+        uaddlv          d0, v0.4s
+        fmov            w0, s0
+
+        ret
+endfunc
+
+function ff_pix_norm1_neon, export=1
+// x0  const uint8_t *pix
+// x1  int line_size
+
+        sxtw            x1, w1
+        movi            v4.16b, #0
+        movi            v5.16b, #0
+        mov             w2, #16
+
+1:
+        ld1             { v1.16b }, [x0], x1
+        subs            w2, w2, #1
+        umull           v2.8h, v1.8b,  v1.8b
+        umull2          v3.8h, v1.16b, v1.16b
+        uadalp          v4.4s, v2.8h
+        uadalp          v5.4s, v3.8h
+        b.ne            1b
+
+        add             v0.4s, v4.4s, v5.4s
+        uaddlv          d0, v0.4s
+        fmov            w0, s0
+
+        ret
+endfunc
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index 997d048663..a96f0b6436 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -245,7 +245,9 @@  av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
 
     c->draw_edges = draw_edges_8_c;
 
-#if ARCH_ARM
+#if ARCH_AARCH64
+    ff_mpegvideoencdsp_init_aarch64(c, avctx);
+#elif ARCH_ARM
     ff_mpegvideoencdsp_init_arm(c, avctx);
 #elif ARCH_PPC
     ff_mpegvideoencdsp_init_ppc(c, avctx);
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index 95084679d9..63dbd39603 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -46,6 +46,8 @@  typedef struct MpegvideoEncDSPContext {
 
 void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
                              AVCodecContext *avctx);
+void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
+                                     AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
                                  AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,