diff mbox series

[FFmpeg-devel,v2,1/1] lavc/aarch64: add some neon pix_abs functions

Message ID 50530740b25747fbbfd138adabdc4a8f@EX13D07UWB004.ant.amazon.com
State Superseded
Headers show
Series lavc/aarch64: add some neon pix_abs functions | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_armv7_RPi4 success Make finished
andriy/make_fate_armv7_RPi4 success Make fate finished

Commit Message

Swinney, Jonathan April 14, 2022, 4:22 p.m. UTC
- ff_pix_abs16_neon
 - ff_pix_abs16_xy2_neon

In direct micro benchmarks of these ff functions versus their C implementations,
these functions performed as follows on AWS Graviton 2:

ff_pix_abs16_neon:
c:  benchmark ran 100000 iterations in 0.955383 seconds
ff: benchmark ran 100000 iterations in 0.097669 seconds

ff_pix_abs16_xy2_neon:
c:  benchmark ran 100000 iterations in 1.916759 seconds
ff: benchmark ran 100000 iterations in 0.370729 seconds

Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
---
 libavcodec/aarch64/Makefile              |   2 +
 libavcodec/aarch64/me_cmp_init_aarch64.c |  39 +++++
 libavcodec/aarch64/me_cmp_neon.S         | 209 +++++++++++++++++++++++
 libavcodec/me_cmp.c                      |   2 +
 libavcodec/me_cmp.h                      |   1 +
 libavcodec/x86/me_cmp.asm                |   7 +
 libavcodec/x86/me_cmp_init.c             |   3 +
 tests/checkasm/Makefile                  |   2 +-
 tests/checkasm/checkasm.c                |   1 +
 tests/checkasm/checkasm.h                |   1 +
 tests/checkasm/motion.c                  | 155 +++++++++++++++++
 11 files changed, 421 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/me_cmp_neon.S
 create mode 100644 tests/checkasm/motion.c

Comments

Michael Niedermayer April 15, 2022, 4:43 p.m. UTC | #1
On Thu, Apr 14, 2022 at 04:22:58PM +0000, Swinney, Jonathan wrote:
>  - ff_pix_abs16_neon
>  - ff_pix_abs16_xy2_neon
> 
> In direct micro benchmarks of these ff functions verses their C implementations,
> these functions performed as follows on AWS Graviton 2:
> 
> ff_pix_abs16_neon:
> c:  benchmark ran 100000 iterations in 0.955383 seconds
> ff: benchmark ran 100000 iterations in 0.097669 seconds
> 
> ff_pix_abs16_xy2_neon:
> c:  benchmark ran 100000 iterations in 1.916759 seconds
> ff: benchmark ran 100000 iterations in 0.370729 seconds
> 
> Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
> ---
>  libavcodec/aarch64/Makefile              |   2 +
>  libavcodec/aarch64/me_cmp_init_aarch64.c |  39 +++++
>  libavcodec/aarch64/me_cmp_neon.S         | 209 +++++++++++++++++++++++
>  libavcodec/me_cmp.c                      |   2 +
>  libavcodec/me_cmp.h                      |   1 +
>  libavcodec/x86/me_cmp.asm                |   7 +
>  libavcodec/x86/me_cmp_init.c             |   3 +
>  tests/checkasm/Makefile                  |   2 +-
>  tests/checkasm/checkasm.c                |   1 +
>  tests/checkasm/checkasm.h                |   1 +
>  tests/checkasm/motion.c                  | 155 +++++++++++++++++
>  11 files changed, 421 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
>  create mode 100644 libavcodec/aarch64/me_cmp_neon.S
>  create mode 100644 tests/checkasm/motion.c
> 
[...]
> diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
> index ad06d485ab..f73b9f9161 100644
> --- a/libavcodec/x86/me_cmp.asm
> +++ b/libavcodec/x86/me_cmp.asm
> @@ -255,6 +255,7 @@ hadamard8x8_diff %+ SUFFIX:
>  
>      HSUM                         m0, m1, eax
>      and                         rax, 0xFFFF
> +    emms
>      ret
>  
>  hadamard8_16_wrapper 0, 14
> @@ -345,6 +346,7 @@ cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
>  
>      HADDD     m7, m1
>      movd     eax, m7         ; return value
> +    emms
>      RET
>  %endmacro

on which arm chip did you test this ?


[...]
> diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
> index 9af911bb88..b330868a38 100644
> --- a/libavcodec/x86/me_cmp_init.c
> +++ b/libavcodec/x86/me_cmp_init.c
> @@ -186,6 +186,8 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
>          : "r" (stride), "m" (h)
>          : "%ecx");
>  
> +    emms_c();
> +
>      return tmp & 0xFFFF;
>  }
>  #undef SUM
> @@ -418,6 +420,7 @@ static inline int sum_mmx(void)
>          "paddw %%mm0, %%mm6             \n\t"
>          "movd %%mm6, %0                 \n\t"
>          : "=r" (ret));
> +    emms_c();
>      return ret & 0xFFFF;
>  }

hmmm

Also, before the patch:
checkasm: all 6153 tests passed
and after it:
checkasm: all 3198 tests passed

That's on an x86-64.

[...]
Martin Storsjö April 15, 2022, 9:13 p.m. UTC | #2
On Thu, 14 Apr 2022, Swinney, Jonathan wrote:

> - ff_pix_abs16_neon
> - ff_pix_abs16_xy2_neon
>
> In direct micro benchmarks of these ff functions verses their C implementations,
> these functions performed as follows on AWS Graviton 2:
>
> ff_pix_abs16_neon:
> c:  benchmark ran 100000 iterations in 0.955383 seconds
> ff: benchmark ran 100000 iterations in 0.097669 seconds
>
> ff_pix_abs16_xy2_neon:
> c:  benchmark ran 100000 iterations in 1.916759 seconds
> ff: benchmark ran 100000 iterations in 0.370729 seconds

It's generally preferred to include the numbers from checkasm --bench for 
these functions. You can execute it with e.g. "checkasm --bench=pix_abs 
--test=motion" to run only the relevant tests and benchmark some specific 
function.


Also for the checkasm test; generally I'd suggest looking closer at some 
existing test as a good example. I think e.g. vp8dsp is a decent testcase 
to use as model.

> Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
> ---
> libavcodec/aarch64/Makefile              |   2 +
> libavcodec/aarch64/me_cmp_init_aarch64.c |  39 +++++
> libavcodec/aarch64/me_cmp_neon.S         | 209 +++++++++++++++++++++++
> libavcodec/me_cmp.c                      |   2 +
> libavcodec/me_cmp.h                      |   1 +
> libavcodec/x86/me_cmp.asm                |   7 +
> libavcodec/x86/me_cmp_init.c             |   3 +
> tests/checkasm/Makefile                  |   2 +-
> tests/checkasm/checkasm.c                |   1 +
> tests/checkasm/checkasm.h                |   1 +
> tests/checkasm/motion.c                  | 155 +++++++++++++++++
> 11 files changed, 421 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
> create mode 100644 libavcodec/aarch64/me_cmp_neon.S
> create mode 100644 tests/checkasm/motion.c


> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 954461f81d..18869da1b4 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
> OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
> OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
> OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
> +OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
> OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
> OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
> OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o

If this is gated behind a CONFIG_ME_CMP here, we should use the same 
CONFIG_ME_CMP for conditionals in checkasm too.

> +++ b/libavcodec/me_cmp.c
> @@ -1062,6 +1062,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
>
>     if (ARCH_ALPHA)
>         ff_me_cmp_init_alpha(c, avctx);
> +    if (ARCH_AARCH64)
> +        ff_me_cmp_init_aarch64(c, avctx);

Please add this in alphabetical order, aarch64 comes before alpha.

>     if (ARCH_ARM)
>         ff_me_cmp_init_arm(c, avctx);
>     if (ARCH_PPC)
> diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
> index e9b5161c9a..2c13bb9d3b 100644
> --- a/libavcodec/me_cmp.h
> +++ b/libavcodec/me_cmp.h
> @@ -81,6 +81,7 @@ typedef struct MECmpContext {
>
> void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
> +void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);

Ditto about alphabetical order

> void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
> diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
> index ad06d485ab..f73b9f9161 100644
> --- a/libavcodec/x86/me_cmp.asm
> +++ b/libavcodec/x86/me_cmp.asm
> @@ -255,6 +255,7 @@ hadamard8x8_diff %+ SUFFIX:
>
>     HSUM                         m0, m1, eax
>     and                         rax, 0xFFFF
> +    emms
>     ret
>

I think we shouldn't be changing the existing x86 functions here. Let's 
start by assuming that the existing x86 functions are correct - they're 
expected to not call emms (as the code expects that to be done at a higher 
level somewhere). Therefore, the new checkasm test needs to check the emms 
handling in a way which accepts the current x86 code. Lots of checkasm 
tests use "declare_func_emms(AV_CPU_FLAG_MMX, ..." which I think implies 
this intent.

> diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> index f768b1144e..f542ce0768 100644
> --- a/tests/checkasm/Makefile
> +++ b/tests/checkasm/Makefile
> @@ -30,7 +30,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)      += v210dec.o
> AVCODECOBJS-$(CONFIG_V210_ENCODER)      += v210enc.o
> AVCODECOBJS-$(CONFIG_VP9_DECODER)       += vp9dsp.o
>
> -CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes)
> +CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes) motion.o
>

I guess this should use CONFIG_ME_CMP?

> # libavfilter tests
> AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> index f74125e810..bbfc38636c 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -155,6 +155,7 @@ static const struct {
>     #if CONFIG_VIDEODSP
>         { "videodsp", checkasm_check_videodsp },
>     #endif
> +        { "motion", checkasm_check_motion },

Ditto about a CONFIG_ME_CMP condition?

> diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
> new file mode 100644
> index 0000000000..9191a35c01
> --- /dev/null
> +++ b/tests/checkasm/motion.c
> @@ -0,0 +1,155 @@
> +/*
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include <string.h>
> +
> +#include "libavutil/common.h"
> +#include "libavutil/intreadwrite.h"
> +#include "libavutil/mem_internal.h"
> +
> +#include "libavcodec/me_cmp.h"
> +#include "libavutil/cpu.h"
> +
> +#include "checkasm.h"
> +
> +int dummy;

This looks unused?

> +
> +#define WIDTH 64
> +#define HEIGHT 64
> +
> +static uint8_t img1[WIDTH * HEIGHT];
> +static uint8_t img2[WIDTH * HEIGHT];

These are usually stack allocated. See e.g. vp8dsp.c - they are also 
normally allocated aligned, e.g. LOCAL_ALIGNED_16(...).

> +
> +
> +static void fill_random(uint8_t *tab, int size)
> +{
> +    int i;
> +    AVLFG prng;
> +
> +    av_lfg_init(&prng, 1);

Don't use your own PRNG here, use the rnd() macro from checkasm.h. This is 
hooked up to the test seed that is printed when checkasm is started, which 
allows covering different test combinations each time the test is executed 
(and can be replayed by rerunning checkasm with --seed=1234).

Overall, check other examples, e.g. vp8dsp.c, for use of the rnd() macro.

> +    for(i=0;i<size;i++) {

Please add spaces around operators.

> +        tab[i] = av_lfg_get(&prng) % 256;
> +    }
> +}
> +
> +static void test_motion(const char *name,
> +                 me_cmp_func test_func, me_cmp_func ref_func)
> +{
> +    int x, y, d1, d2, it;
> +    uint8_t *ptr;
> +
> +declare_func(int, struct MpegEncContext *c,
> +             uint8_t *blk1 /* align width (8 or 16) */,
> +             uint8_t *blk2 /* align 1 */, ptrdiff_t stride,
> +             int h);

Maybe declare_func_emms would avoid the need for the emms changes.

> +
> +    if (test_func == ref_func || test_func == NULL || ref_func == NULL) {
> +        return;
> +    }
> +
> +    /* test correctness */
> +    for(it=0;it<20;it++) {
> +
> +        fill_random(img1, WIDTH * HEIGHT);
> +        fill_random(img2, WIDTH * HEIGHT);

Here, fill_random reruns things deterministically as it always reinits the 
PRNG in the same way, so running 20 iterations doesn't increase test 
coverage. With proper use of rnd() it could work though.

> +
> +        if (check_func(test_func, "%s", name)) {

Don't rerun check_func() many times when testing multiple iterations; use 
check_func() once outermost, and while testing that specific function, do 
as many iterations as are relevant

> +            for(y=0;y<HEIGHT-17;y++) {
> +                for(x=0;x<WIDTH-17;x++) {

This is indeed a very exhaustive test, that's great. But if we already 
have such an exhaustive test we probably don't need to do 20 iterations of 
it. If the test instead only tests a couple randomly chosen dimensions, 
doing e.g. 20 iterations sounds like a good idea.

> +                    ptr = img2 + y * WIDTH + x;
> +                    d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
> +                    d1 = call_new(NULL, img1, ptr, WIDTH, 8);
> +
> +                    if (d1 != d2) {
> +                        fail();
> +                        printf("error: mmx=%d c=%d\n", d1, d2);
> +                    }
> +                    bench_new(NULL, img1, ptr, WIDTH, 8);

If doing multiple iterations of the same, I would suggest not running 
bench_new each of them; normally you'd have exhaustive testing using 
call_ref/call_new and checking their outputs, and then just a couple calls 
with bench_new to benchmark things. (In this setup, the benchmark score 
ends up an average of all input size combinations. In some cases, the 
benchmark is only done on the biggest dimension, or on a couple relevant 
cases.)

> +                }
> +            }
> +        }
> +    }
> +    emms_c();
> +}
> +
> +#define sizeof_array(ar) (sizeof(ar)/sizeof((ar)[0]))

Use FF_ARRAY_ELEMS

> +
> +#define ME_CMP_1D_ARRAYS(XX)                                                   \
> +    XX(sad)                                                                    \
> +    XX(sse)                                                                    \
> +    XX(hadamard8_diff)                                                         \
> +    XX(dct_sad)                                                                \
> +    XX(quant_psnr)                                                             \
> +    XX(bit)                                                                    \
> +    XX(rd)                                                                     \
> +    XX(vsad)                                                                   \
> +    XX(vsse)                                                                   \
> +    XX(nsse)                                                                   \
> +    XX(w53)                                                                    \
> +    XX(w97)                                                                    \
> +    XX(dct_max)                                                                \
> +    XX(dct264_sad)                                                             \
> +    XX(me_pre_cmp)                                                             \
> +    XX(me_cmp)                                                                 \
> +    XX(me_sub_cmp)                                                             \
> +    XX(mb_cmp)                                                                 \
> +    XX(ildct_cmp)                                                              \
> +    XX(frame_skip_cmp)                                                         \
> +    XX(median_sad)
> +
> +
> +static void check_motion(void)
> +{
> +    char buf[64];
> +    AVCodecContext *ctx;
> +    MECmpContext c_ctx, ff_ctx;
> +
> +    memset(&c_ctx, 0, sizeof(c_ctx));
> +    memset(&ff_ctx, 0, sizeof(ff_ctx));
> +
> +    /* allocate AVCodecContext */
> +    ctx = avcodec_alloc_context3(NULL);
> +    ctx->flags |= AV_CODEC_FLAG_BITEXACT;
> +    /* clear cpu flags to get C versions of functions */
> +    ff_me_cmp_init(&ff_ctx, ctx);
> +    av_force_cpu_flags(0);
> +    ff_me_cmp_init(&c_ctx, ctx);

No, this isn't how you do it. A test shouldn't touch the cpu flags 
manually. (This probably is what causes the surprising difference in test 
counts that Michael noticed.)

On a high level, the checkasm test framework runs your test multiple 
times, with the cpu mask set to all intermediate levels. So first your 
test gets the C-only version of the function. On the next time around, it 
gets e.g. the MMX version of the function (if any). Then later it gets an 
SSE2, SSSE3, etc version of the function.

The check_func() macro does the magic - it looks up (using the string key) 
the previous function implementation for the same key, so that that 
function gets used as reference. So within your test you should only init 
your DSP functions once, using the cpu feature mask that the test 
framework sets up for you.

> +
> +    for (int i = 0; i < sizeof_array(c_ctx.pix_abs); i++) {
> +        for (int j = 0; j < sizeof_array(c_ctx.pix_abs[0]); j++) {
> +            snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j);
> +            test_motion(buf, ff_ctx.pix_abs[i][j], c_ctx.pix_abs[i][j]);
> +        }
> +    }
> +
> +#define XX(me_cmp_array)                                                        \
> +    for (int i = 0; i < sizeof_array(c_ctx.me_cmp_array); i++) {                \
> +        snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i);                     \
> +        test_motion(buf, ff_ctx.me_cmp_array[i], c_ctx.me_cmp_array[i]);        \
> +    }
> +    ME_CMP_1D_ARRAYS(XX)
> +#undef XX
> +
> +}
> +
> +void checkasm_check_motion(void)
> +{
> +    check_motion();
> +    report("motion");
> +}
> -- 
> 2.32.0

In addition to the test setup you've done, you also need to add the test 
to tests/fate/checkasm.mak, so that "make fate-checkasm" (and make fate) 
includes this new test.

// Martin
Swinney, Jonathan April 25, 2022, 10:43 p.m. UTC | #3
Thanks to Michael and Martin for your reviews on several of my patches. I've made many of the changes you have requested, but I'm not yet ready to resubmit the patches. I'll be out of the office until next week and I will submit updated versions then. Thanks!

-- 

Jonathan Swinney

On 4/15/22, 11:45 AM, "ffmpeg-devel on behalf of Michael Niedermayer" <ffmpeg-devel-bounces@ffmpeg.org on behalf of michael@niedermayer.cc> wrote:

    CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.



    On Thu, Apr 14, 2022 at 04:22:58PM +0000, Swinney, Jonathan wrote:
    >  - ff_pix_abs16_neon
    >  - ff_pix_abs16_xy2_neon
    >
    > In direct micro benchmarks of these ff functions verses their C implementations,
    > these functions performed as follows on AWS Graviton 2:
    >
    > ff_pix_abs16_neon:
    > c:  benchmark ran 100000 iterations in 0.955383 seconds
    > ff: benchmark ran 100000 iterations in 0.097669 seconds
    >
    > ff_pix_abs16_xy2_neon:
    > c:  benchmark ran 100000 iterations in 1.916759 seconds
    > ff: benchmark ran 100000 iterations in 0.370729 seconds
    >
    > Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
    > ---
    >  libavcodec/aarch64/Makefile              |   2 +
    >  libavcodec/aarch64/me_cmp_init_aarch64.c |  39 +++++
    >  libavcodec/aarch64/me_cmp_neon.S         | 209 +++++++++++++++++++++++
    >  libavcodec/me_cmp.c                      |   2 +
    >  libavcodec/me_cmp.h                      |   1 +
    >  libavcodec/x86/me_cmp.asm                |   7 +
    >  libavcodec/x86/me_cmp_init.c             |   3 +
    >  tests/checkasm/Makefile                  |   2 +-
    >  tests/checkasm/checkasm.c                |   1 +
    >  tests/checkasm/checkasm.h                |   1 +
    >  tests/checkasm/motion.c                  | 155 +++++++++++++++++
    >  11 files changed, 421 insertions(+), 1 deletion(-)
    >  create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
    >  create mode 100644 libavcodec/aarch64/me_cmp_neon.S
    >  create mode 100644 tests/checkasm/motion.c
    >
    [...]
    > diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
    > index ad06d485ab..f73b9f9161 100644
    > --- a/libavcodec/x86/me_cmp.asm
    > +++ b/libavcodec/x86/me_cmp.asm
    > @@ -255,6 +255,7 @@ hadamard8x8_diff %+ SUFFIX:
    >
    >      HSUM                         m0, m1, eax
    >      and                         rax, 0xFFFF
    > +    emms
    >      ret
    >
    >  hadamard8_16_wrapper 0, 14
    > @@ -345,6 +346,7 @@ cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
    >
    >      HADDD     m7, m1
    >      movd     eax, m7         ; return value
    > +    emms
    >      RET
    >  %endmacro

    on which arm chip did you test this ?


    [...]
    > diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
    > index 9af911bb88..b330868a38 100644
    > --- a/libavcodec/x86/me_cmp_init.c
    > +++ b/libavcodec/x86/me_cmp_init.c
    > @@ -186,6 +186,8 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
    >          : "r" (stride), "m" (h)
    >          : "%ecx");
    >
    > +    emms_c();
    > +
    >      return tmp & 0xFFFF;
    >  }
    >  #undef SUM
    > @@ -418,6 +420,7 @@ static inline int sum_mmx(void)
    >          "paddw %%mm0, %%mm6             \n\t"
    >          "movd %%mm6, %0                 \n\t"
    >          : "=r" (ret));
    > +    emms_c();
    >      return ret & 0xFFFF;
    >  }

    hmmm

    Also before the patch
    checkasm: all 6153 tests passed
    after it
    checkasm: all 3198 tests passed

    thats on a x86-64

    [...]

    --
    Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

    Complexity theory is the science of finding the exact solution to an
    approximation. Benchmarking OTOH is finding an approximation of the exact
diff mbox series

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..18869da1b4 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@  OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
 OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
@@ -46,6 +47,7 @@  NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
 NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
new file mode 100644
index 0000000000..9fb63e9973
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -0,0 +1,39 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                      ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                      ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->pix_abs[0][0] = ff_pix_abs16_neon;
+        c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+    }
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
new file mode 100644
index 0000000000..3b48cb156d
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -0,0 +1,209 @@ 
+/*
+ * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_pix_abs16_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+        // x5           uint8_t *pix3
+        cmp             w4, #4                      // if h < 4, jump to completion section
+        movi            v18.4S, #0                  // clear result accumulator
+        b.lt            2f
+1:
+        movi            v16.8h, #0                  // clear uabal accumulator
+        ld1             {v0.16b}, [x1], x3          // load pix1
+        ld1             {v4.16b}, [x2], x3          // load pix2
+        ld1             {v1.16b}, [x1], x3          // load pix1
+        ld1             {v5.16b}, [x2], x3          // load pix2
+        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
+        uabal2          v16.8h, v0.16b, v4.16b
+        ld1             {v2.16b}, [x1], x3          // load pix1
+        ld1             {v6.16b}, [x2], x3          // load pix2
+        uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
+        uabal2          v16.8h, v1.16b, v5.16b
+        ld1             {v3.16b}, [x1], x3
+        ld1             {v7.16b}, [x2], x3
+        uabal           v16.8h, v2.8b, v6.8b
+        uabal2          v16.8h, v2.16b, v6.16b
+        sub             w4, w4, #4                  // h -= 4
+        uabal           v16.8h, v3.8b, v7.8b
+        uabal2          v16.8h, v3.16b, v7.16b
+        cmp             w4, #4                      // if h >= 4, loop
+        uaddlv          s17, v16.8h                 // add up everything in v16 accumulator
+        add             d18, d17, d18               // add to the end result register
+
+        b.ge            1b
+        cbnz            w4, 2f                      // if iterations remain, jump to completion section
+
+        fmov            w0, s18                     // copy result to general purpose register
+        ret
+
+2:
+        movi            v16.8h, #0                  // clear the uabal accumulator
+        ld1             {v0.16b}, [x1], x3          // load pix1
+        ld1             {v4.16b}, [x2], x3          // load pix2
+        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
+        uabal2          v16.8h, v0.16b, v4.16b
+        addv            h17, v16.8h                 // add up v16
+        add             d18, d17, d18               // add to result
+        subs            w4, w4, #1                  // h -= 1
+        b.ne            2b
+
+        fmov            w0, s18                     // copy result to general purpose register
+        ret
+endfunc
+
+function ff_pix_abs16_xy2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+        // x5           uint8_t *pix3
+        add             x5, x2, x3                  // create a pointer for pix3
+        movi            v0.2d, #0                   // initialize the result register
+
+        // Load initial pix2 values for either the unrolled version or completion version.
+        ldr             q4, [x2, #1]                // load pix2+1
+        ldr             q3, [x2]                    // load pix2
+        uaddl           v2.8h, v4.8b, v3.8b         // pix2 + pix2+1 0..7
+        uaddl2          v3.8h, v4.16b, v3.16b       // pix2 + pix2+1 8..15
+        cmp             w4, #4                      // if h < 4 jump to the completion version
+        b.lt            2f
+1:
+        // This is an unrolled implementation. It completes 4 iterations of the C loop for each branch.
+        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
+        // plus two at the beginning to start.
+        ldr             q5, [x5, #1]                // load pix3+1
+        ld1             {v4.16b}, [x5], x3          // load pix3
+        ld1             {v1.16b}, [x1], x3          // load pix1
+
+        ldr             q7, [x5, #1]                // load pix3+1
+        ld1             {v6.16b}, [x5], x3          // load pix3
+        ld1             {v16.16b}, [x1], x3         // load pix1
+
+        ldr             q19, [x5, #1]               // load pix3+1
+        ld1             {v18.16b}, [x5], x3         // load pix3
+        ld1             {v17.16b}, [x1], x3         // load pix1
+
+        ldr             q22, [x5, #1]               // load pix3+1
+        ld1             {v21.16b}, [x5], x3         // load pix3
+        ld1             {v20.16b}, [x1], x3         // load pix1
+
+        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
+        uaddl           v30.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
+        uaddl2          v31.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
+        add             v23.8h, v2.8h, v30.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
+        add             v24.8h, v3.8h, v31.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
+        urshr           v23.8h, v23.8h, #2          // shift right 2 0..7 (rounding shift right)
+        urshr           v24.8h, v24.8h, #2          // shift right 2 8..15
+
+        uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
+        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
+        add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        urshr           v26.8h, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
+        urshr           v27.8h, v27.8h, #2          // shift right 2 8..15
+
+        uaddl           v4.8h, v18.8b, v19.8b       // pix3 + pix3+1 0..7
+        uaddl2          v5.8h, v18.16b, v19.16b     // pix3 + pix3+1 8..15
+        add             v28.8h, v2.8h, v4.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add             v29.8h, v3.8h, v5.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        urshr           v28.8h, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
+        urshr           v29.8h, v29.8h, #2          // shift right 2 8..15
+
+        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
+        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
+        add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        urshr           v30.8h, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
+        urshr           v31.8h, v31.8h, #2          // shift right 2 8..15
+
+        // Averages are now stored in these registers:
+        // v23, v24
+        // v26, v27
+        // v28, v29
+        // v30, v31
+        // pix1 values in these registers:
+        // v1, v16, v17, v20
+        // available
+        // v4, v5, v7, v16, v18, v19, v25
+
+        uxtl2           v4.8h, v1.16b               // 8->16 bits pix1 8..15
+        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
+        uxtl2           v7.8h, v16.16b              // 8->16 bits pix1 8..15
+        uxtl            v6.8h, v16.8b               // 8->16 bits pix1 0..7
+        uxtl2           v18.8h, v17.16b             // 8->16 bits pix1 8..15
+        uxtl            v17.8h, v17.8b              // 8->16 bits pix1 0..7
+        uxtl2           v25.8h, v20.16b             // 8->16 bits pix1 8..15
+        uxtl            v20.8h, v20.8b              // 8->16 bits pix1 0..7
+
+        uabd            v5.8h, v1.8h, v23.8h        // absolute difference 0..7
+        uaba            v5.8h, v4.8h, v24.8h        // absolute difference accumulate 8..15
+        uaba            v5.8h, v6.8h, v26.8h        // absolute difference accumulate 0..7
+        uaba            v5.8h, v7.8h, v27.8h        // absolute difference accumulate 8..15
+        uaba            v5.8h, v17.8h, v28.8h       // absolute difference accumulate 0..7
+        uaba            v5.8h, v18.8h, v29.8h       // absolute difference accumulate 8..15
+        uaba            v5.8h, v20.8h, v30.8h       // absolute difference accumulate 0..7
+        uaba            v5.8h, v25.8h, v31.8h       // absolute difference accumulate 8..15
+
+        uaddlv          s5, v5.8h                   // add up accumulated values
+        sub             w4, w4, #4                  // h -= 4
+        add             d0, d0, d5                  // add to final result
+        cmp             w4, #4                      // loop if h >= 4
+        b.ge            1b
+        cbnz            w4, 2f                      // if iterations remain jump to completion section
+
+        fmov            w0, s0                      // copy result to general purpose register
+        ret
+2:
+        // v2 and v3 are set either at the end of this loop or at from the unrolled version
+        // which branches here to complete iterations when h % 4 != 0.
+        ldr             q5, [x5, #1]                // load pix3+1
+        ld1             {v4.16b}, [x5], x3          // load pix3
+        ld1             {v1.16b}, [x1], x3          // load pix1
+        sub             w4, w4, #1                  // decrement h
+
+        uaddl           v18.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
+        uaddl2          v19.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
+        add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
+        add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
+        // divide by 4 to compute the average of values summed above
+        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
+        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15
+
+        uxtl2           v8.8h, v1.16b               // 8->16 bits pix1 8..15
+        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
+
+        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
+        uaba            v6.8h, v8.8h, v17.8h        // absolute difference accumulate 8..15
+        mov             v2.16b, v18.16b             // pix3 -> pix2
+        mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
+        addv            h6, v6.8h                   // add up accumulator in v6
+        add             d0, d0, d6                  // add to the final result
+
+        cbnz            w4, 2b                      // loop if h > 0
+        fmov            w0, s0                      // copy result to general purpose register
+        ret
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index b2f87d2e1b..2bda7c030c 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1062,6 +1062,8 @@  av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
 
     if (ARCH_ALPHA)
         ff_me_cmp_init_alpha(c, avctx);
+    if (ARCH_AARCH64)
+        ff_me_cmp_init_aarch64(c, avctx);
     if (ARCH_ARM)
         ff_me_cmp_init_arm(c, avctx);
     if (ARCH_PPC)
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index e9b5161c9a..2c13bb9d3b 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -81,6 +81,7 @@  typedef struct MECmpContext {
 
 void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index ad06d485ab..f73b9f9161 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -255,6 +255,7 @@  hadamard8x8_diff %+ SUFFIX:
 
     HSUM                         m0, m1, eax
     and                         rax, 0xFFFF
+    emms
     ret
 
 hadamard8_16_wrapper 0, 14
@@ -345,6 +346,7 @@  cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
 
     HADDD     m7, m1
     movd     eax, m7         ; return value
+    emms
     RET
 %endmacro
 
@@ -463,6 +465,7 @@  cglobal hf_noise%1, 3,3,0, pix1, lsize, h
     psrlq      m6, 32
     paddd      m0, m6
     movd      eax, m0   ; eax = result of hf_noise8;
+    emms
     REP_RET                 ; return eax;
 %endmacro
 
@@ -516,6 +519,7 @@  align 16
     paddw     m2, m0
 %endif
     movd     eax, m2
+    emms
     RET
 %endmacro
 
@@ -593,6 +597,7 @@  align 16
     paddw     m0, m1
 %endif
     movd     eax, m0
+    emms
     RET
 %endmacro
 
@@ -663,6 +668,7 @@  align 16
     paddw     m0, m1
 %endif
     movd     eax, m0
+    emms
     RET
 %endmacro
 
@@ -825,6 +831,7 @@  cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
     paddd  m0, m1
 %endif
     movd eax, m0
+    emms
     RET
 %endmacro
 
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 9af911bb88..b330868a38 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -186,6 +186,8 @@  static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
         : "r" (stride), "m" (h)
         : "%ecx");
 
+    emms_c();
+
     return tmp & 0xFFFF;
 }
 #undef SUM
@@ -418,6 +420,7 @@  static inline int sum_mmx(void)
         "paddw %%mm0, %%mm6             \n\t"
         "movd %%mm6, %0                 \n\t"
         : "=r" (ret));
+    emms_c();
     return ret & 0xFFFF;
 }
 
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index f768b1144e..f542ce0768 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -30,7 +30,7 @@  AVCODECOBJS-$(CONFIG_V210_DECODER)      += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)      += v210enc.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)       += vp9dsp.o
 
-CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes)
+CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes) motion.o
 
 # libavfilter tests
 AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index f74125e810..bbfc38636c 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -155,6 +155,7 @@  static const struct {
     #if CONFIG_VIDEODSP
         { "videodsp", checkasm_check_videodsp },
     #endif
+        { "motion", checkasm_check_motion },
 #endif
 #if CONFIG_AVFILTER
     #if CONFIG_AFIR_FILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index c3192d8c23..1269ab7cc0 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -67,6 +67,7 @@  void checkasm_check_huffyuvdsp(void);
 void checkasm_check_jpeg2000dsp(void);
 void checkasm_check_llviddsp(void);
 void checkasm_check_llviddspenc(void);
+void checkasm_check_motion(void);
 void checkasm_check_nlmeans(void);
 void checkasm_check_opusdsp(void);
 void checkasm_check_pixblockdsp(void);
diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
new file mode 100644
index 0000000000..9191a35c01
--- /dev/null
+++ b/tests/checkasm/motion.c
@@ -0,0 +1,155 @@ 
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/me_cmp.h"
+#include "libavutil/cpu.h"
+
+#include "checkasm.h"
+
+int dummy;
+
+#define WIDTH 64
+#define HEIGHT 64
+
+static uint8_t img1[WIDTH * HEIGHT];
+static uint8_t img2[WIDTH * HEIGHT];
+
+
+/*
+ * Fill tab[0..size-1] with pseudo-random byte values.
+ *
+ * The bytes come from rnd(), the shared checkasm generator made available by
+ * checkasm.h. A locally re-seeded generator would return the same sequence on
+ * every call, which would make img1 and img2 byte-identical and every test
+ * iteration a repeat of the first one.
+ */
+static void fill_random(uint8_t *tab, int size)
+{
+    for (int i = 0; i < size; i++)
+        tab[i] = rnd() % 256;
+}
+
+/*
+ * Compare one optimized me_cmp function against the reference implementation.
+ *
+ * name      - label passed to check_func() to identify the function.
+ * test_func - the (possibly optimized) function under test.
+ * ref_func  - the function obtained with all CPU flags cleared; it is only
+ *             used to detect that no distinct optimized version exists.
+ *
+ * For 20 rounds of fresh random data, every block position (x, y) inside img2
+ * that leaves at least 17 columns/rows of margin is compared against the block
+ * at the start of img1, using a block height of 8 and a stride of WIDTH.
+ */
+static void test_motion(const char *name,
+                 me_cmp_func test_func, me_cmp_func ref_func)
+{
+    int x, y, d1, d2, it;
+    uint8_t *ptr = img2;
+
+    declare_func(int, struct MpegEncContext *c,
+                 uint8_t *blk1 /* align width (8 or 16) */,
+                 uint8_t *blk2 /* align 1 */, ptrdiff_t stride,
+                 int h);
+
+    /* nothing to do when there is no optimized version distinct from the reference */
+    if (test_func == ref_func || test_func == NULL || ref_func == NULL) {
+        return;
+    }
+
+    /*
+     * check_func() is queried once, outside the iteration loop: checkasm skips
+     * a function version it has already seen, so asking again on every
+     * iteration would leave only the first iteration doing any work.
+     */
+    if (check_func(test_func, "%s", name)) {
+        /* test correctness */
+        for (it = 0; it < 20; it++) {
+            fill_random(img1, WIDTH * HEIGHT);
+            fill_random(img2, WIDTH * HEIGHT);
+
+            for (y = 0; y < HEIGHT - 17; y++) {
+                for (x = 0; x < WIDTH - 17; x++) {
+                    ptr = img2 + y * WIDTH + x;
+                    d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
+                    d1 = call_new(NULL, img1, ptr, WIDTH, 8);
+
+                    if (d1 != d2) {
+                        fail();
+                        printf("error: new=%d ref=%d\n", d1, d2);
+                    }
+                }
+            }
+        }
+        /* benchmark once per function instead of once per block position */
+        bench_new(NULL, img1, ptr, WIDTH, 8);
+    }
+    emms_c();
+}
+
+#define sizeof_array(ar) (sizeof(ar)/sizeof((ar)[0]))
+
+/*
+ * X-macro listing the one-dimensional function-pointer arrays of MECmpContext
+ * that check_motion() iterates over. XX is expanded once per member name;
+ * the two-dimensional pix_abs table is handled separately in check_motion().
+ */
+#define ME_CMP_1D_ARRAYS(XX)                                                   \
+    XX(sad)                                                                    \
+    XX(sse)                                                                    \
+    XX(hadamard8_diff)                                                         \
+    XX(dct_sad)                                                                \
+    XX(quant_psnr)                                                             \
+    XX(bit)                                                                    \
+    XX(rd)                                                                     \
+    XX(vsad)                                                                   \
+    XX(vsse)                                                                   \
+    XX(nsse)                                                                   \
+    XX(w53)                                                                    \
+    XX(w97)                                                                    \
+    XX(dct_max)                                                                \
+    XX(dct264_sad)                                                             \
+    XX(me_pre_cmp)                                                             \
+    XX(me_cmp)                                                                 \
+    XX(me_sub_cmp)                                                             \
+    XX(mb_cmp)                                                                 \
+    XX(ildct_cmp)                                                              \
+    XX(frame_skip_cmp)                                                         \
+    XX(median_sad)
+
+
+/*
+ * Build two MECmpContext function tables - one with the CPU flags currently in
+ * effect (ff_ctx) and one with all CPU flags cleared (c_ctx) - and run
+ * test_motion() on every pix_abs entry and on every array named in
+ * ME_CMP_1D_ARRAYS.
+ *
+ * The CPU flags are restored and the codec context is released before
+ * returning, so the forced-to-zero flags do not leak into later code.
+ */
+static void check_motion(void)
+{
+    char buf[64];
+    AVCodecContext *ctx;
+    MECmpContext c_ctx, ff_ctx;
+    /* remember the active flags so they can be restored after forcing 0 */
+    int cpu_flags = av_get_cpu_flags();
+
+    memset(&c_ctx, 0, sizeof(c_ctx));
+    memset(&ff_ctx, 0, sizeof(ff_ctx));
+
+    /* allocate AVCodecContext */
+    ctx = avcodec_alloc_context3(NULL);
+    if (!ctx) {
+        fprintf(stderr, "motion: failed to allocate AVCodecContext\n");
+        return;
+    }
+    ctx->flags |= AV_CODEC_FLAG_BITEXACT;
+
+    /* function table for the flags currently in effect */
+    ff_me_cmp_init(&ff_ctx, ctx);
+    /* clear cpu flags to get C versions of functions, then put them back */
+    av_force_cpu_flags(0);
+    ff_me_cmp_init(&c_ctx, ctx);
+    av_force_cpu_flags(cpu_flags);
+
+    for (int i = 0; i < sizeof_array(c_ctx.pix_abs); i++) {
+        for (int j = 0; j < sizeof_array(c_ctx.pix_abs[0]); j++) {
+            snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j);
+            test_motion(buf, ff_ctx.pix_abs[i][j], c_ctx.pix_abs[i][j]);
+        }
+    }
+
+#define XX(me_cmp_array)                                                        \
+    for (int i = 0; i < sizeof_array(c_ctx.me_cmp_array); i++) {                \
+        snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i);                     \
+        test_motion(buf, ff_ctx.me_cmp_array[i], c_ctx.me_cmp_array[i]);        \
+    }
+    ME_CMP_1D_ARRAYS(XX)
+#undef XX
+
+    avcodec_free_context(&ctx);
+}
+
+/*
+ * checkasm entry point for the motion-estimation comparison functions:
+ * runs check_motion() and then reports the outcome under the name "motion".
+ */
+void checkasm_check_motion(void)
+{
+    check_motion();
+    report("motion");
+}