diff mbox series

[FFmpeg-devel,v2,3/7] avcodec/aarch64/mpegvideoencdsp: add neon implementations for pix_sum and pix_norm1

Message ID 20240821145555.235323-4-ramiro.polla@gmail.com
State New
Headers show
Series avcodec/mpegvideoencdsp improvements | expand

Commit Message

Ramiro Polla Aug. 21, 2024, 2:55 p.m. UTC
                  A55             A76
pix_norm1_c:     484.3           235.2
pix_norm1_neon:  193.8 ( 2.50x)   44.7 ( 5.26x)
pix_sum_c:       302.8           243.7
pix_sum_neon:     81.6 ( 3.71x)   26.0 ( 9.37x)
---
 libavcodec/aarch64/Makefile               |  2 +
 libavcodec/aarch64/mpegvideoencdsp_init.c | 39 +++++++++++++
 libavcodec/aarch64/mpegvideoencdsp_neon.S | 69 +++++++++++++++++++++++
 libavcodec/mpegvideoencdsp.c              |  4 +-
 libavcodec/mpegvideoencdsp.h              |  2 +
 5 files changed, 115 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/mpegvideoencdsp_init.c
 create mode 100644 libavcodec/aarch64/mpegvideoencdsp_neon.S

Comments

Ramiro Polla Aug. 21, 2024, 5:41 p.m. UTC | #1
On Wed, Aug 21, 2024 at 6:41 PM Martin Storsjö <martin@martin.st> wrote:
> On Wed, 21 Aug 2024, Ramiro Polla wrote:
> >                   A55             A76
> > pix_norm1_c:     484.3           235.2
> > pix_norm1_neon:  193.8 ( 2.50x)   44.7 ( 5.26x)
> > pix_sum_c:       302.8           243.7
> > pix_sum_neon:     81.6 ( 3.71x)   26.0 ( 9.37x)
> > ---
> > libavcodec/aarch64/Makefile               |  2 +
> > libavcodec/aarch64/mpegvideoencdsp_init.c | 39 +++++++++++++
> > libavcodec/aarch64/mpegvideoencdsp_neon.S | 69 +++++++++++++++++++++++
> > libavcodec/mpegvideoencdsp.c              |  4 +-
> > libavcodec/mpegvideoencdsp.h              |  2 +
> > 5 files changed, 115 insertions(+), 1 deletion(-)
> > create mode 100644 libavcodec/aarch64/mpegvideoencdsp_init.c
> > create mode 100644 libavcodec/aarch64/mpegvideoencdsp_neon.S
> >
> > diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> > index a3256bb1cc..de0653ebbc 100644
> > --- a/libavcodec/aarch64/Makefile
> > +++ b/libavcodec/aarch64/Makefile
> > @@ -10,6 +10,7 @@ OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
> > OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
> > OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
> > OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
> > +OBJS-$(CONFIG_MPEGVIDEOENC)             += aarch64/mpegvideoencdsp_init.o
> > OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
> > OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
> > OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
> > @@ -51,6 +52,7 @@ NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_neon.o              \
> >                                            aarch64/simple_idct_neon.o
> > NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
> > NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
> > +NEON-OBJS-$(CONFIG_MPEGVIDEOENC)        += aarch64/mpegvideoencdsp_neon.o
> > NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
> > NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
> > NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
> > diff --git a/libavcodec/aarch64/mpegvideoencdsp_init.c b/libavcodec/aarch64/mpegvideoencdsp_init.c
> > new file mode 100644
> > index 0000000000..7eb632ed1b
> > --- /dev/null
> > +++ b/libavcodec/aarch64/mpegvideoencdsp_init.c
> > @@ -0,0 +1,39 @@
> > +/*
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include <stddef.h>
> > +#include <stdint.h>
> > +
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/aarch64/cpu.h"
> > +#include "libavcodec/mpegvideoencdsp.h"
> > +#include "config.h"
> > +
> > +int ff_pix_sum16_neon(const uint8_t *pix, int line_size);
> > +int ff_pix_norm1_neon(const uint8_t *pix, int line_size);
> > +
> > +av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
> > +                                             AVCodecContext *avctx)
> > +{
> > +    int cpu_flags = av_get_cpu_flags();
> > +
> > +    if (have_neon(cpu_flags)) {
> > +        c->pix_sum   = ff_pix_sum16_neon;
> > +        c->pix_norm1 = ff_pix_norm1_neon;
> > +    }
> > +}
> > diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> > new file mode 100644
> > index 0000000000..6e7a9319ba
> > --- /dev/null
> > +++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> > @@ -0,0 +1,69 @@
> > +/*
> > + * Copyright (c) 2024 Ramiro Polla
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/aarch64/asm.S"
> > +
> > +function ff_pix_sum16_neon, export=1
> > +// x0  const uint8_t *pix
> > +// x1  int line_size
> > +
> > +        add             x2, x0, w1, sxtw
> > +        sbfiz           x1, x1, #1, #32
>
> BTW, this instruction is kinda exotic and the docs aren't super clear, so
> it'd be good to test manually that it really does what we want, for
> negative numbers and numbers close to the ends of the value range; I
> didn't do that manually yet.

I prefer just sticking to sxtw + lsl then. When we move to ptrdiff_t
the sxtw will be gone anyway.

> > +        movi            v0.16b, #0
> > +        mov             w3, #16
> > +
> > +1:
> > +        ld1             {v1.16b}, [x0], x1
> > +        ld1             {v2.16b}, [x2], x1
> > +        subs            w3, w3, #2
> > +        uadalp          v0.8h, v1.16b
> > +        uadalp          v0.8h, v2.16b
> > +        b.ne            1b
> > +
> > +        uaddlv          s0, v0.8h
> > +        fmov            w0, s0
> > +
> > +        ret
> > +endfunc
> > +
> > +function ff_pix_norm1_neon, export=1
> > +// x0  const uint8_t *pix
> > +// x1  int line_size
> > +
> > +        sxtw            x1, w1
> > +        movi            v4.16b, #0
> > +        movi            v5.16b, #0
> > +        mov             w2, #16
> > +
> > +1:
> > +        ld1             {v1.16b}, [x0], x1
> > +        subs            w2, w2, #1
> > +        umull           v2.8h, v1.8b,  v1.8b
> > +        umull2          v3.8h, v1.16b, v1.16b
> > +        uadalp          v4.4s, v2.8h
> > +        uadalp          v5.4s, v3.8h
>
> From my earlier testing on A53, it seemed (surprisingly) to be equally
> fast to accumulate into the same register for both instructions - but I
> only tested that on A53. So we could change that here, getting rid of the
> add at the end (and one movi). Or if it does help on some other core,
> perhaps we should do the same for the function above too?

Indeed, it is equally fast to accumulate into the same register on the
A55 and A76 as well.

New patches attached (patch 3/7 has functional changes, but patch 4/7
only changes the commit message to reflect the new test run).
Martin Storsjö Aug. 21, 2024, 7:44 p.m. UTC | #2
On Wed, 21 Aug 2024, Ramiro Polla wrote:

>> BTW, this instruction is kinda exotic and the docs aren't super clear, so
>> it'd be good to test manually that it really does what we want, for
>> negative numbers and numbers close to the ends of the value range; I
>> didn't do that manually yet.
>
> I prefer just sticking to sxtw + lsl then. When we move to ptrdiff_t
> the sxtw will be gone anyway.

This sounds like a very reasonable choice indeed, especially if it's 
somewhat plausible that we'll get rid of it at some point in the future.

>>> +        movi            v0.16b, #0
>>> +        mov             w3, #16
>>> +
>>> +1:
>>> +        ld1             {v1.16b}, [x0], x1
>>> +        ld1             {v2.16b}, [x2], x1
>>> +        subs            w3, w3, #2
>>> +        uadalp          v0.8h, v1.16b
>>> +        uadalp          v0.8h, v2.16b
>>> +        b.ne            1b
>>> +
>>> +        uaddlv          s0, v0.8h
>>> +        fmov            w0, s0
>>> +
>>> +        ret
>>> +endfunc
>>> +
>>> +function ff_pix_norm1_neon, export=1
>>> +// x0  const uint8_t *pix
>>> +// x1  int line_size
>>> +
>>> +        sxtw            x1, w1
>>> +        movi            v4.16b, #0
>>> +        movi            v5.16b, #0
>>> +        mov             w2, #16
>>> +
>>> +1:
>>> +        ld1             {v1.16b}, [x0], x1
>>> +        subs            w2, w2, #1
>>> +        umull           v2.8h, v1.8b,  v1.8b
>>> +        umull2          v3.8h, v1.16b, v1.16b
>>> +        uadalp          v4.4s, v2.8h
>>> +        uadalp          v5.4s, v3.8h
>>
>> From my earlier testing on A53, it seemed (surprisingly) to be equally
>> fast to accumulate into the same register for both instructions - but I
>> only tested that on A53. So we could change that here, getting rid of the
>> add at the end (and one movi). Or if it does help on some other core,
>> perhaps we should do the same for the function above too?
>
> Indeed, it is equally fast to accumulate into the same register on the
> A55 and A76 as well.
>
> New patches attached (patch 3/7 has functional changes, but patch 4/7
> only changes the commit message to reflect the new test run).

LGTM very much now, thanks! And thanks for your patience through all the 
iterations on such trivial patches as these.

// Martin
Ramiro Polla Aug. 22, 2024, 11:29 a.m. UTC | #3
On Wed, Aug 21, 2024 at 9:44 PM Martin Storsjö <martin@martin.st> wrote:
> On Wed, 21 Aug 2024, Ramiro Polla wrote:
> >> BTW, this instruction is kinda exotic and the docs aren't super clear, so
> >> it'd be good to test manually that it really does what we want, for
> >> negative numbers and numbers close to the ends of the value range; I
> >> didn't do that manually yet.
> >
> > I prefer just sticking to sxtw + lsl then. When we move to ptrdiff_t
> > the sxtw will be gone anyway.
>
> This sounds like a very reasonable choice indeed, especially if it's
> somewhat plausible that we'll get rid of it at some point in the future.
>
> >>> +        movi            v0.16b, #0
> >>> +        mov             w3, #16
> >>> +
> >>> +1:
> >>> +        ld1             {v1.16b}, [x0], x1
> >>> +        ld1             {v2.16b}, [x2], x1
> >>> +        subs            w3, w3, #2
> >>> +        uadalp          v0.8h, v1.16b
> >>> +        uadalp          v0.8h, v2.16b
> >>> +        b.ne            1b
> >>> +
> >>> +        uaddlv          s0, v0.8h
> >>> +        fmov            w0, s0
> >>> +
> >>> +        ret
> >>> +endfunc
> >>> +
> >>> +function ff_pix_norm1_neon, export=1
> >>> +// x0  const uint8_t *pix
> >>> +// x1  int line_size
> >>> +
> >>> +        sxtw            x1, w1
> >>> +        movi            v4.16b, #0
> >>> +        movi            v5.16b, #0
> >>> +        mov             w2, #16
> >>> +
> >>> +1:
> >>> +        ld1             {v1.16b}, [x0], x1
> >>> +        subs            w2, w2, #1
> >>> +        umull           v2.8h, v1.8b,  v1.8b
> >>> +        umull2          v3.8h, v1.16b, v1.16b
> >>> +        uadalp          v4.4s, v2.8h
> >>> +        uadalp          v5.4s, v3.8h
> >>
> >> From my earlier testing on A53, it seemed (surprisingly) to be equally
> >> fast to accumulate into the same register for both instructions - but I
> >> only tested that on A53. So we could change that here, getting rid of the
> >> add at the end (and one movi). Or if it does help on some other core,
> >> perhaps we should do the same for the function above too?
> >
> > Indeed, it is equally fast to accumulate into the same register on the
> > A55 and A76 as well.
> >
> > New patches attached (patch 3/7 has functional changes, but patch 4/7
> > only changes the commit message to reflect the new test run).
>
> LGTM very much now, thanks! And thanks for your patience through all the
> iterations on such trivial patches as these.

And thank you for your patience through the reviews :). I'm slowly
getting up to speed with aarch64 and neon.

I'll apply the pix_sum and pix_norm1 patches, and I'll wait a few days
for any comments on the draw_edges patches.
Ramiro Polla Aug. 26, 2024, 10:55 a.m. UTC | #4
On Thu, Aug 22, 2024 at 1:29 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
> On Wed, Aug 21, 2024 at 9:44 PM Martin Storsjö <martin@martin.st> wrote:
> > On Wed, 21 Aug 2024, Ramiro Polla wrote:
> > >> BTW, this instruction is kinda exotic and the docs aren't super clear, so
> > >> it'd be good to test manually that it really does what we want, for
> > >> negative numbers and numbers close to the ends of the value range; I
> > >> didn't do that manually yet.
> > >
> > > I prefer just sticking to sxtw + lsl then. When we move to ptrdiff_t
> > > the sxtw will be gone anyway.
> >
> > This sounds like a very reasonable choice indeed, especially if it's
> > somewhat plausible that we'll get rid of it at some point in the future.
> >
> > >>> +        movi            v0.16b, #0
> > >>> +        mov             w3, #16
> > >>> +
> > >>> +1:
> > >>> +        ld1             {v1.16b}, [x0], x1
> > >>> +        ld1             {v2.16b}, [x2], x1
> > >>> +        subs            w3, w3, #2
> > >>> +        uadalp          v0.8h, v1.16b
> > >>> +        uadalp          v0.8h, v2.16b
> > >>> +        b.ne            1b
> > >>> +
> > >>> +        uaddlv          s0, v0.8h
> > >>> +        fmov            w0, s0
> > >>> +
> > >>> +        ret
> > >>> +endfunc
> > >>> +
> > >>> +function ff_pix_norm1_neon, export=1
> > >>> +// x0  const uint8_t *pix
> > >>> +// x1  int line_size
> > >>> +
> > >>> +        sxtw            x1, w1
> > >>> +        movi            v4.16b, #0
> > >>> +        movi            v5.16b, #0
> > >>> +        mov             w2, #16
> > >>> +
> > >>> +1:
> > >>> +        ld1             {v1.16b}, [x0], x1
> > >>> +        subs            w2, w2, #1
> > >>> +        umull           v2.8h, v1.8b,  v1.8b
> > >>> +        umull2          v3.8h, v1.16b, v1.16b
> > >>> +        uadalp          v4.4s, v2.8h
> > >>> +        uadalp          v5.4s, v3.8h
> > >>
> > >> From my earlier testing on A53, it seemed (surprisingly) to be equally
> > >> fast to accumulate into the same register for both instructions - but I
> > >> only tested that on A53. So we could change that here, getting rid of the
> > >> add at the end (and one movi). Or if it does help on some other core,
> > >> perhaps we should do the same for the function above too?
> > >
> > > Indeed, it is equally fast to accumulate into the same register on the
> > > A55 and A76 as well.
> > >
> > > New patches attached (patch 3/7 has functional changes, but patch 4/7
> > > only changes the commit message to reflect the new test run).
> >
> > LGTM very much now, thanks! And thanks for your patience through all the
> > iterations on such trivial patches as these.
>
> And thank you for your patience through the reviews :). I'm slowly
> getting up to speed with aarch64 and neon.
>
> I'll apply the pix_sum and pix_norm1 patches, and I'll wait a few days
> for any comments on the draw_edges patches.

Applied.
diff mbox series

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index a3256bb1cc..de0653ebbc 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -10,6 +10,7 @@  OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
 OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
 OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
+OBJS-$(CONFIG_MPEGVIDEOENC)             += aarch64/mpegvideoencdsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
 OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
@@ -51,6 +52,7 @@  NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_neon.o              \
                                            aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
+NEON-OBJS-$(CONFIG_MPEGVIDEOENC)        += aarch64/mpegvideoencdsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
 NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/mpegvideoencdsp_init.c b/libavcodec/aarch64/mpegvideoencdsp_init.c
new file mode 100644
index 0000000000..7eb632ed1b
--- /dev/null
+++ b/libavcodec/aarch64/mpegvideoencdsp_init.c
@@ -0,0 +1,39 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideoencdsp.h"
+#include "config.h"
+
+int ff_pix_sum16_neon(const uint8_t *pix, int line_size);
+int ff_pix_norm1_neon(const uint8_t *pix, int line_size);
+
+av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
+                                             AVCodecContext *avctx) /* avctx is not referenced in the body */
+{ /* Points c->pix_sum and c->pix_norm1 at the NEON routines when NEON is reported. */
+    int cpu_flags = av_get_cpu_flags(); /* CPU feature flags consulted by the check below */
+
+    if (have_neon(cpu_flags)) { /* without NEON, c is left exactly as the caller passed it */
+        c->pix_sum   = ff_pix_sum16_neon; /* overrides any pix_sum already stored in c */
+        c->pix_norm1 = ff_pix_norm1_neon; /* overrides any pix_norm1 already stored in c */
+    }
+}
diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S b/libavcodec/aarch64/mpegvideoencdsp_neon.S
new file mode 100644
index 0000000000..6e7a9319ba
--- /dev/null
+++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
@@ -0,0 +1,69 @@ 
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_pix_sum16_neon, export=1            // returns in w0 the sum of 16 rows of 16 bytes starting at pix
+// x0  const uint8_t *pix
+// x1  int line_size
+
+        add             x2, x0, w1, sxtw        // x2 = pix + sign-extended line_size (second row)
+        sbfiz           x1, x1, #1, #32         // x1 = sign-extended line_size * 2 (step of two rows)
+        movi            v0.16b, #0              // clear the eight 16-bit accumulator lanes
+        mov             w3, #16                 // rows left to process
+
+1:
+        ld1             {v1.16b}, [x0], x1      // load 16 bytes from x0, then x0 += x1
+        ld1             {v2.16b}, [x2], x1      // load 16 bytes from x2, then x2 += x1
+        subs            w3, w3, #2              // two rows are consumed per iteration
+        uadalp          v0.8h, v1.16b           // add adjacent byte pairs into the 16-bit lanes
+        uadalp          v0.8h, v2.16b           // worst case per lane: 16 rows * 2 bytes * 255 = 8160
+        b.ne            1b                      // repeat until the row counter reaches zero
+
+        uaddlv          s0, v0.8h               // add the eight lanes into one 32-bit total
+        fmov            w0, s0                  // move the total into the integer result register
+
+        ret
+endfunc
+
+function ff_pix_norm1_neon, export=1            // returns in w0 the sum of squares of 16 rows of 16 bytes starting at pix
+// x0  const uint8_t *pix
+// x1  int line_size
+
+        sxtw            x1, w1                  // widen the 32-bit line_size to a 64-bit row step
+        movi            v4.16b, #0              // accumulator for the squares of bytes 0-7 of each row
+        movi            v5.16b, #0              // accumulator for the squares of bytes 8-15 of each row
+        mov             w2, #16                 // rows left to process
+
+1:
+        ld1             {v1.16b}, [x0], x1      // load one 16-byte row, then x0 += x1
+        subs            w2, w2, #1              // one row is consumed per iteration
+        umull           v2.8h, v1.8b,  v1.8b    // 16-bit squares of bytes 0-7
+        umull2          v3.8h, v1.16b, v1.16b   // 16-bit squares of bytes 8-15
+        uadalp          v4.4s, v2.8h            // add adjacent square pairs into the 32-bit lanes
+        uadalp          v5.4s, v3.8h
+        b.ne            1b                      // repeat until the row counter reaches zero
+
+        add             v0.4s, v4.4s, v5.4s     // merge the two accumulators lane by lane
+        uaddlv          d0, v0.4s               // add the four lanes into one 64-bit total
+        fmov            w0, s0                  // result is the low 32 bits (max 256 * 255^2 fits)
+
+        ret
+endfunc
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index 9ccf1c302e..1091c94574 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -245,7 +245,9 @@  av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
 
     c->draw_edges = draw_edges_8_c;
 
-#if ARCH_ARM
+#if ARCH_AARCH64
+    ff_mpegvideoencdsp_init_aarch64(c, avctx);
+#elif ARCH_ARM
     ff_mpegvideoencdsp_init_arm(c, avctx);
 #elif ARCH_PPC
     ff_mpegvideoencdsp_init_ppc(c, avctx);
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index 3925d87dab..f437bc4e4e 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -46,6 +46,8 @@  typedef struct MpegvideoEncDSPContext {
 
 void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
                              AVCodecContext *avctx);
+void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
+                                     AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
                                  AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,