diff mbox series

[FFmpeg-devel,v4,1/1] lavc/aarch64: motion estimation functions in neon

Message ID 82e0929f4af04100872873ce7dc41683@EX13D07UWB004.ant.amazon.com
State New
Headers show
Series [FFmpeg-devel,v4,1/1] lavc/aarch64: motion estimation functions in neon | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_armv7_RPi4 success Make finished
andriy/make_fate_armv7_RPi4 success Make fate finished

Commit Message

Swinney, Jonathan June 26, 2022, 8:58 p.m. UTC
- ff_pix_abs16_neon
 - ff_pix_abs16_xy2_neon

In direct micro benchmarks of these ff functions verses their C implementations,
these functions performed as follows on AWS Graviton 3.

ff_pix_abs16_neon:
pix_abs_0_0_c: 141.1
pix_abs_0_0_neon: 19.6

ff_pix_abs16_xy2_neon:
pix_abs_0_3_c: 269.1
pix_abs_0_3_neon: 39.3

Tested with:
./tests/checkasm/checkasm --test=motion --bench --disable-linux-perf

Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
---
 libavcodec/aarch64/Makefile              |   2 +
 libavcodec/aarch64/me_cmp_init_aarch64.c |  39 +++++
 libavcodec/aarch64/me_cmp_neon.S         | 205 +++++++++++++++++++++++
 libavcodec/me_cmp.c                      |   4 +-
 libavcodec/me_cmp.h                      |   1 +
 tests/checkasm/Makefile                  |   1 +
 tests/checkasm/checkasm.c                |   3 +
 tests/checkasm/checkasm.h                |   1 +
 tests/checkasm/motion.c                  | 151 +++++++++++++++++
 tests/fate/checkasm.mak                  |   1 +
 10 files changed, 407 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/me_cmp_neon.S
 create mode 100644 tests/checkasm/motion.c

Comments

Martin Storsjö June 27, 2022, 9:55 p.m. UTC | #1
On Sun, 26 Jun 2022, Swinney, Jonathan wrote:

> - ff_pix_abs16_neon
> - ff_pix_abs16_xy2_neon
>
> In direct micro benchmarks of these ff functions verses their C implementations,
> these functions performed as follows on AWS Graviton 3.
>
> ff_pix_abs16_neon:
> pix_abs_0_0_c: 141.1
> pix_abs_0_0_neon: 19.6
>
> ff_pix_abs16_xy2_neon:
> pix_abs_0_3_c: 269.1
> pix_abs_0_3_neon: 39.3
>
> Tested with:
> ./tests/checkasm/checkasm --test=motion --bench --disable-linux-perf
>
> Signed-off-by: Jonathan Swinney <jswinney@amazon.com>

Thanks! This looked great to me, and you seem to have applied all my 
feedback!

(There was one last case of "ldur" missing that I fixed up locally.)

With that fixed, I pushed this one now. Thanks!

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index c8935f205e..9ce21566c6 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@  OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
 OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
@@ -47,6 +48,7 @@  NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_neon.o              \
                                            aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
 NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
new file mode 100644
index 0000000000..9fb63e9973
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -0,0 +1,39 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                      ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                      ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->pix_abs[0][0] = ff_pix_abs16_neon;
+        c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+    }
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
new file mode 100644
index 0000000000..f71e0eee5f
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -0,0 +1,205 @@ 
+/*
+ * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_pix_abs16_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+        cmp             w4, #4                      // if h < 4, jump to completion section
+        movi            v18.4S, #0                  // clear result accumulator
+        b.lt            2f
+1:
+        ld1             {v0.16b}, [x1], x3          // load pix1
+        ld1             {v4.16b}, [x2], x3          // load pix2
+        ld1             {v1.16b}, [x1], x3          // load pix1
+        ld1             {v5.16b}, [x2], x3          // load pix2
+        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
+        uabdl2          v17.8h, v0.16b, v4.16b
+        ld1             {v2.16b}, [x1], x3          // load pix1
+        ld1             {v6.16b}, [x2], x3          // load pix2
+        uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
+        uabal2          v17.8h, v1.16b, v5.16b
+        ld1             {v3.16b}, [x1], x3
+        ld1             {v7.16b}, [x2], x3
+        uabal           v16.8h, v2.8b, v6.8b
+        uabal2          v17.8h, v2.16b, v6.16b
+        sub             w4, w4, #4                  // h -= 4
+        uabal           v16.8h, v3.8b, v7.8b
+        uabal2          v17.8h, v3.16b, v7.16b
+        cmp             w4, #4                      // if h >= 4, loop
+        add             v16.8h, v16.8h, v17.8h
+        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
+        add             d18, d16, d18               // add to the end result register
+
+        b.ge            1b
+        cbnz            w4, 2f                      // if iterations remain, jump to completion section
+
+        fmov            w0, s18                     // copy result to general purpose register
+        ret
+
+2:
+        ld1             {v0.16b}, [x1], x3          // load pix1
+        ld1             {v4.16b}, [x2], x3          // load pix2
+        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
+        uabal2          v16.8h, v0.16b, v4.16b
+        subs            w4, w4, #1                  // h -= 1
+        addv            h16, v16.8h                 // add up v16
+        add             d18, d16, d18               // add to result
+        b.ne            2b
+
+        fmov            w0, s18                     // copy result to general purpose register
+        ret
+endfunc
+
+function ff_pix_abs16_xy2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        add             x5, x2, x3                  // use x5 to hold uint8_t *pix3
+        movi            v0.2d, #0                   // initialize the result register
+
+        // Load initial pix2 values for either the unrolled version or completion version.
+        ldur            q4, [x2, #1]                // load pix2+1
+        ldr             q3, [x2]                    // load pix2
+        uaddl           v2.8h, v4.8b, v3.8b         // pix2 + pix2+1 0..7
+        uaddl2          v3.8h, v4.16b, v3.16b       // pix2 + pix2+1 8..15
+        cmp             w4, #4                      // if h < 4 jump to the completion version
+        b.lt            2f
+1:
+        // This is an unrolled implementation. It completes 4 iterations of the C for each branch.
+        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
+        // plus two at the beginning to start.
+        ldur            q5, [x5, #1]                // load pix3+1
+        ld1             {v4.16b}, [x5], x3          // load pix3
+        ld1             {v1.16b}, [x1], x3          // load pix1
+
+        ldur            q7, [x5, #1]                // load pix3+1
+        ld1             {v6.16b}, [x5], x3          // load pix3
+        ld1             {v16.16b}, [x1], x3         // load pix1
+
+        ldur            q19, [x5, #1]               // load pix3+1
+        ld1             {v18.16b}, [x5], x3         // load pix3
+        ld1             {v17.16b}, [x1], x3         // load pix1
+
+        ldur            q22, [x5, #1]               // load pix3+1
+        ld1             {v21.16b}, [x5], x3         // load pix3
+        ld1             {v20.16b}, [x1], x3         // load pix1
+
+        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
+        uaddl           v30.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
+        uaddl2          v31.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
+        add             v23.8h, v2.8h, v30.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
+        add             v24.8h, v3.8h, v31.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
+        rshrn           v23.8b, v23.8h, #2          // shift right 2 0..7 (rounding shift right)
+        rshrn2          v23.16b, v24.8h, #2         // shift right 2 8..15
+
+        uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
+        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
+        add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        rshrn           v26.8b, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
+        rshrn2          v26.16b, v27.8h, #2         // shift right 2 8..15
+
+        uaddl           v4.8h, v18.8b, v19.8b       // pix3 + pix3+1 0..7
+        uaddl2          v5.8h, v18.16b, v19.16b     // pix3 + pix3+1 8..15
+        add             v28.8h, v2.8h, v4.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add             v29.8h, v3.8h, v5.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        rshrn           v28.8b, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
+        rshrn2          v28.16b, v29.8h, #2         // shift right 2 8..15
+
+        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
+        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
+        add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        rshrn           v30.8b, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
+        rshrn2          v30.16b, v31.8h, #2         // shift right 2 8..15
+
+        // Averages are now stored in these registers:
+        // v23, v16, v28, v30
+        // pix1 values in these registers:
+        // v1, v16, v17, v20
+        // available:
+        // v4, v5, v7, v18, v19, v24, v25, v27, v29, v31
+
+        sub             w4, w4, #4                  // h -= 4
+
+        // Using absolute-difference instructions instead of absolute-difference-accumulate allows
+        // us to keep the results in 16b vectors instead of widening values with twice the instructions.
+        // This approach also has fewer data dependencies, allowing better instruction level parallelism.
+        uabd            v4.16b, v1.16b, v23.16b     // absolute difference 0..15, i=0
+        uabd            v5.16b, v16.16b, v26.16b    // absolute difference 0..15, i=1
+        uabd            v6.16b, v17.16b, v28.16b    // absolute difference 0..15, i=2
+        uabd            v7.16b, v20.16b, v30.16b    // absolute difference 0..15, i=3
+
+        cmp             w4, #4                      // loop if h >= 4
+
+        // Now add up all the values in each vector, v4-v7 with widening adds
+        uaddl           v19.8h, v4.8b, v5.8b
+        uaddl2          v18.8h, v4.16b, v5.16b
+        uaddl           v4.8h, v6.8b, v7.8b
+        uaddl2          v5.8h, v6.16b, v7.16b
+        add             v4.8h, v4.8h, v5.8h
+        add             v4.8h, v4.8h, v18.8h
+        add             v4.8h, v4.8h, v19.8h
+        uaddlv          s4, v4.8h                   // finish adding up accumulated values
+        add             d0, d0, d4                  // add the value to the top level accumulator
+
+        b.ge            1b
+        cbnz            w4, 2f                      // if iterations remain jump to completion section
+
+        fmov            w0, s0                      // copy result to general purpose register
+        ret
+2:
+        // v2 and v3 are set either at the end of this loop or at from the unrolled version
+        // which branches here to complete iterations when h % 4 != 0.
+        ldr             q5, [x5, #1]                // load pix3+1
+        ld1             {v4.16b}, [x5], x3          // load pix3
+        ld1             {v1.16b}, [x1], x3          // load pix1
+        subs            w4, w4, #1                  // decrement h
+
+        uaddl           v18.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
+        uaddl2          v19.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
+        add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
+        add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
+        // divide by 4 to compute the average of values summed above
+        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
+        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15
+
+        uxtl2           v8.8h, v1.16b               // 8->16 bits pix1 8..15
+        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
+
+        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
+        uaba            v6.8h, v8.8h, v17.8h        // absolute difference accumulate 8..15
+        mov             v2.16b, v18.16b             // pix3 -> pix2
+        mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
+        uaddlv          s6, v6.8h                   // add up accumulator in v6
+        add             d0, d0, d6                  // add to the final result
+
+        b.ne            2b                          // loop if h > 0
+        fmov            w0, s0                      // copy result to general purpose register
+        ret
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index 4407d0a7e9..d0cd14ad33 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1061,7 +1061,9 @@  av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
     ff_dsputil_init_dwt(c);
 #endif
 
-#if ARCH_ALPHA
+#if ARCH_AARCH64
+    ff_me_cmp_init_aarch64(c, avctx);
+#elif ARCH_ALPHA
     ff_me_cmp_init_alpha(c, avctx);
 #elif ARCH_ARM
     ff_me_cmp_init_arm(c, avctx);
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index e9b5161c9a..7b057a923b 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -80,6 +80,7 @@  typedef struct MECmpContext {
 } MECmpContext;
 
 void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index f6b1008855..e869c70b55 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -12,6 +12,7 @@  AVCODECOBJS-$(CONFIG_H264QPEL)          += h264qpel.o
 AVCODECOBJS-$(CONFIG_IDCTDSP)           += idctdsp.o
 AVCODECOBJS-$(CONFIG_LLVIDDSP)          += llviddsp.o
 AVCODECOBJS-$(CONFIG_LLVIDENCDSP)       += llviddspenc.o
+AVCODECOBJS-$(CONFIG_ME_CMP)            += motion.o
 AVCODECOBJS-$(CONFIG_VC1DSP)            += vc1dsp.o
 AVCODECOBJS-$(CONFIG_VP8DSP)            += vp8dsp.o
 AVCODECOBJS-$(CONFIG_VIDEODSP)          += videodsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 57134f96ea..5ffcafbda9 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -135,6 +135,9 @@  static const struct {
     #if CONFIG_LLVIDENCDSP
         { "llviddspenc", checkasm_check_llviddspenc },
     #endif
+    #if CONFIG_ME_CMP
+        { "motion", checkasm_check_motion },
+    #endif
     #if CONFIG_OPUS_DECODER
         { "opusdsp", checkasm_check_opusdsp },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index a86db140e3..b601a98754 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -68,6 +68,7 @@  void checkasm_check_idctdsp(void);
 void checkasm_check_jpeg2000dsp(void);
 void checkasm_check_llviddsp(void);
 void checkasm_check_llviddspenc(void);
+void checkasm_check_motion(void);
 void checkasm_check_nlmeans(void);
 void checkasm_check_opusdsp(void);
 void checkasm_check_pixblockdsp(void);
diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
new file mode 100644
index 0000000000..f337dd6f95
--- /dev/null
+++ b/tests/checkasm/motion.c
@@ -0,0 +1,151 @@ 
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/me_cmp.h"
+
+#include "checkasm.h"
+
+static void fill_random(uint8_t *tab, int size)
+{
+    int i;
+    for (i = 0; i < size; i++) {
+        tab[i] = rnd() % 256;
+    }
+}
+
+static void test_motion(const char *name, me_cmp_func test_func)
+{
+    /* test configurarion */
+#define ITERATIONS 16
+#define WIDTH 64
+#define HEIGHT 64
+
+    /* motion estimation can look up to 17 bytes ahead */
+    static const int look_ahead = 17;
+
+    int i, x, y, d1, d2;
+    uint8_t *ptr;
+
+    LOCAL_ALIGNED_8(uint8_t, img1, [WIDTH * HEIGHT]);
+    LOCAL_ALIGNED_8(uint8_t, img2, [WIDTH * HEIGHT]);
+
+    declare_func_emms(AV_CPU_FLAG_MMX, int, struct MpegEncContext *c,
+                      uint8_t *blk1 /* align width (8 or 16) */,
+                      uint8_t *blk2 /* align 1 */, ptrdiff_t stride,
+                      int h);
+
+    if (test_func == NULL) {
+        return;
+    }
+
+    /* test correctness */
+    fill_random(img1, WIDTH * HEIGHT);
+    fill_random(img2, WIDTH * HEIGHT);
+
+    if (check_func(test_func, "%s", name)) {
+        for (i = 0; i < ITERATIONS; i++) {
+            x = rnd() % (WIDTH - look_ahead);
+            y = rnd() % (HEIGHT - look_ahead);
+
+            ptr = img2 + y * WIDTH + x;
+            d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
+            d1 = call_new(NULL, img1, ptr, WIDTH, 8);
+
+            if (d1 != d2) {
+                fail();
+                printf("func: %s, x=%d y=%d, error: asm=%d c=%d\n", name, x, y, d1, d2);
+                break;
+            }
+        }
+        // benchmark with the final value of ptr
+        bench_new(NULL, img1, ptr, WIDTH, 8);
+    }
+}
+
+#define ME_CMP_1D_ARRAYS(XX)                                                   \
+    XX(sad)                                                                    \
+    XX(sse)                                                                    \
+    XX(hadamard8_diff)                                                         \
+    XX(vsad)                                                                   \
+    XX(vsse)                                                                   \
+    XX(nsse)                                                                   \
+    XX(me_pre_cmp)                                                             \
+    XX(me_cmp)                                                                 \
+    XX(me_sub_cmp)                                                             \
+    XX(mb_cmp)                                                                 \
+    XX(ildct_cmp)                                                              \
+    XX(frame_skip_cmp)                                                         \
+    XX(median_sad)
+
+// tests for functions not yet implemented
+#if 0
+    XX(dct_sad)                                                                \
+    XX(quant_psnr)                                                             \
+    XX(bit)                                                                    \
+    XX(rd)                                                                     \
+    XX(w53)                                                                    \
+    XX(w97)                                                                    \
+    XX(dct_max)                                                                \
+    XX(dct264_sad)                                                             \
+
+#endif
+
+static void check_motion(void)
+{
+    char buf[64];
+    AVCodecContext *av_ctx;
+    MECmpContext me_ctx;
+
+    memset(&me_ctx, 0, sizeof(me_ctx));
+
+    /* allocate AVCodecContext */
+    av_ctx = avcodec_alloc_context3(NULL);
+    av_ctx->flags |= AV_CODEC_FLAG_BITEXACT;
+
+    ff_me_cmp_init(&me_ctx, av_ctx);
+
+    for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.pix_abs); i++) {
+        for (int j = 0; j < FF_ARRAY_ELEMS(me_ctx.pix_abs[0]); j++) {
+            snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j);
+            test_motion(buf, me_ctx.pix_abs[i][j]);
+        }
+    }
+
+#define XX(me_cmp_array)                                                        \
+    for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.me_cmp_array); i++) {             \
+        snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i);                     \
+        test_motion(buf, me_ctx.me_cmp_array[i]);                               \
+    }
+    ME_CMP_1D_ARRAYS(XX)
+#undef XX
+
+    avcodec_free_context(&av_ctx);
+}
+
+void checkasm_check_motion(void)
+{
+    check_motion();
+    report("motion");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index c6273db183..4d2f321e84 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -23,6 +23,7 @@  FATE_CHECKASM = fate-checkasm-aacpsdsp                                  \
                 fate-checkasm-jpeg2000dsp                               \
                 fate-checkasm-llviddsp                                  \
                 fate-checkasm-llviddspenc                               \
+                fate-checkasm-motion                                    \
                 fate-checkasm-opusdsp                                   \
                 fate-checkasm-pixblockdsp                               \
                 fate-checkasm-sbrdsp                                    \