diff mbox series

[FFmpeg-devel,v3,1/1] lavc/aarch64: motion estimation functions in neon

Message ID 3def712d3c4f4551bdbf15166eecde35@EX13D07UWB004.ant.amazon.com
State New
Headers show
Series lavc/aarch64: motion estimation functions in neon | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_armv7_RPi4 success Make finished
andriy/make_fate_armv7_RPi4 success Make fate finished

Commit Message

Swinney, Jonathan June 18, 2022, 1:56 a.m. UTC
- ff_pix_abs16_neon
- ff_pix_abs16_xy2_neon

In direct micro benchmarks of these ff functions versus their C implementations,
these functions performed as follows on AWS Graviton 3.

ff_pix_abs16_neon:
pix_abs_0_0_c: 135.3
pix_abs_0_0_neon: 22.0

ff_pix_abs16_xy2_neon:
pix_abs_0_3_c: 264.5
pix_abs_0_3_neon: 46.8

Tested with:
```
./tests/checkasm/checkasm --test=motion  --bench
```

Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
---
 libavcodec/aarch64/Makefile              |   2 +
 libavcodec/aarch64/me_cmp_init_aarch64.c |  39 +++++
 libavcodec/aarch64/me_cmp_neon.S         | 209 +++++++++++++++++++++++
 libavcodec/me_cmp.c                      |   4 +-
 libavcodec/me_cmp.h                      |   1 +
 tests/checkasm/Makefile                  |   1 +
 tests/checkasm/checkasm.c                |   3 +
 tests/checkasm/checkasm.h                |   1 +
 tests/checkasm/motion.c                  | 144 ++++++++++++++++
 tests/fate/checkasm.mak                  |   1 +
 10 files changed, 404 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/me_cmp_neon.S
 create mode 100644 tests/checkasm/motion.c

Comments

Martin Storsjö June 21, 2022, 7:36 p.m. UTC | #1
On Sat, 18 Jun 2022, Swinney, Jonathan wrote:

> - ff_pix_abs16_neon
> - ff_pix_abs16_xy2_neon
>
> In direct micro benchmarks of these ff functions verses their C implementations,
> these functions performed as follows on AWS Graviton 3.
>
> ff_pix_abs16_neon:
> pix_abs_0_0_c: 135.3
> pix_abs_0_0_neon: 22.0
>
> ff_pix_abs16_xy2_neon:
> pix_abs_0_3_c: 264.5
> pix_abs_0_3_neon: 46.8
>
> Tested with:
> ```
> ./tests/checkasm/checkasm --test=motion  --bench
> ```
>
> Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
> ---
> libavcodec/aarch64/Makefile              |   2 +
> libavcodec/aarch64/me_cmp_init_aarch64.c |  39 +++++
> libavcodec/aarch64/me_cmp_neon.S         | 209 +++++++++++++++++++++++
> libavcodec/me_cmp.c                      |   4 +-
> libavcodec/me_cmp.h                      |   1 +
> tests/checkasm/Makefile                  |   1 +
> tests/checkasm/checkasm.c                |   3 +
> tests/checkasm/checkasm.h                |   1 +
> tests/checkasm/motion.c                  | 144 ++++++++++++++++
> tests/fate/checkasm.mak                  |   1 +
> 10 files changed, 404 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
> create mode 100644 libavcodec/aarch64/me_cmp_neon.S
> create mode 100644 tests/checkasm/motion.c

Overall, this looks pretty good, thanks! There's a couple minor things 
still left to discuss.


> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> new file mode 100644
> index 0000000000..3b48cb156d
> --- /dev/null
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -0,0 +1,209 @@
> +
> +function ff_pix_abs16_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +        // x5           uint8_t *pix3

This register isn't used in this function, so it's mostly confusing to 
keep it listed here.

> +        cmp             w4, #4                      // if h < 4, jump to completion section
> +        movi            v18.4S, #0                  // clear result accumulator
> +        b.lt            2f
> +1:
> +        movi            v16.8h, #0                  // clear uabal accumulator
> +        ld1             {v0.16b}, [x1], x3          // load pix1
> +        ld1             {v4.16b}, [x2], x3          // load pix2
> +        ld1             {v1.16b}, [x1], x3          // load pix1
> +        ld1             {v5.16b}, [x2], x3          // load pix2
> +        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
> +        uabal2          v16.8h, v0.16b, v4.16b

I guess this first one could be uabdl, which would save the movi v16.8h, 
#0? I guess it doesn't make any practical difference though.

> +        ld1             {v2.16b}, [x1], x3          // load pix1
> +        ld1             {v6.16b}, [x2], x3          // load pix2
> +        uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
> +        uabal2          v16.8h, v1.16b, v5.16b

With two consecutive updates to v16.8h like this, I'm afraid that the 
second one ends up blocking waiting for the result from the first one. 
Would this be faster if using two accumulator registers in parallel, 
adding them to each other only at the end?

> +        ld1             {v3.16b}, [x1], x3
> +        ld1             {v7.16b}, [x2], x3
> +        uabal           v16.8h, v2.8b, v6.8b
> +        uabal2          v16.8h, v2.16b, v6.16b
> +        sub             w4, w4, #4                  // h -= 4
> +        uabal           v16.8h, v3.8b, v7.8b
> +        uabal2          v16.8h, v3.16b, v7.16b
> +        cmp             w4, #4                      // if h >= 4, loop
> +        uaddlv          s17, v16.8h                 // add up everything in v16 accumulator
> +        add             d18, d17, d18               // add to the end result register
> +
> +        b.ge            1b
> +        cbnz            w4, 2f                      // if iterations remain, jump to completion section
> +
> +        fmov            w0, s18                     // copy result to general purpose register
> +        ret
> +
> +2:
> +        movi            v16.8h, #0                  // clear the uabal accumulator
> +        ld1             {v0.16b}, [x1], x3          // load pix1
> +        ld1             {v4.16b}, [x2], x3          // load pix2
> +        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate

Same thing about using uabdl for the first case.

> +        uabal2          v16.8h, v0.16b, v4.16b
> +        addv            h17, v16.8h                 // add up v16
> +        add             d18, d17, d18               // add to result
> +        subs            w4, w4, #1                  // h -= 1
> +        b.ne            2b

I think this would work better on in-order cores if you'd move the subs up 
between addv and add, or after the two ld1.

> +
> +        fmov            w0, s18                     // copy result to general purpose register
> +        ret
> +endfunc
> +
> +function ff_pix_abs16_xy2_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +        // x5           uint8_t *pix3

First I wondered whether pix3 was a parameter (which it isn't), but having 
the local register allocation documented like this is good of course. 
Would it be possible to make it clearer somehow that this is a local 
variable, not a parameter?

> +        add             x5, x2, x3                  // create a pointer for pix3
> +        movi            v0.2d, #0                   // initialize the result register
> +
> +        // Load initial pix2 values for either the unrolled version or completion version.
> +        ldr             q4, [x2, #1]                // load pix2+1

With Microsoft armasm64, this fails - you need to use ldur instead of ldr 
here (which is what GAS and llvm assemble it to implicitly). Same thing in 
all the other cases with offset below.

> +        ldr             q3, [x2]                    // load pix2
> +        uaddl           v2.8h, v4.8b, v3.8b         // pix2 + pix2+1 0..7
> +        uaddl2          v3.8h, v4.16b, v3.16b       // pix2 + pix2+1 8..15
> +        cmp             w4, #4                      // if h < 4 jump to the completion version
> +        b.lt            2f
> +1:
> +        // This is an unrolled implemntation. It completes 4 iterations of the C for each branch.
> +        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
> +        // plus two at the begining to start.
> +        ldr             q5, [x5, #1]                // load pix3+1
> +        ld1             {v4.16b}, [x5], x3          // load pix3
> +        ld1             {v1.16b}, [x1], x3          // load pix1
> +
> +        ldr             q7, [x5, #1]                // load pix3+1
> +        ld1             {v6.16b}, [x5], x3          // load pix3
> +        ld1             {v16.16b}, [x1], x3         // load pix1
> +
> +        ldr             q19, [x5, #1]               // load pix3+1
> +        ld1             {v18.16b}, [x5], x3         // load pix3
> +        ld1             {v17.16b}, [x1], x3         // load pix1
> +
> +        ldr             q22, [x5, #1]               // load pix3+1
> +        ld1             {v21.16b}, [x5], x3         // load pix3
> +        ld1             {v20.16b}, [x1], x3         // load pix1
> +
> +        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
> +        uaddl           v30.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
> +        uaddl2          v31.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
> +        add             v23.8h, v2.8h, v30.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
> +        add             v24.8h, v3.8h, v31.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
> +        urshr           v23.8h, v23.8h, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr           v24.8h, v24.8h, #2          // shift right 2 8..15
> +
> +        uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
> +        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
> +        add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
> +        add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above
> +        urshr           v26.8h, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr           v27.8h, v27.8h, #2          // shift right 2 8..15
> +
> +        uaddl           v4.8h, v18.8b, v19.8b       // pix3 + pix3+1 0..7
> +        uaddl2          v5.8h, v18.16b, v19.16b     // pix3 + pix3+1 8..15
> +        add             v28.8h, v2.8h, v4.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
> +        add             v29.8h, v3.8h, v5.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
> +        urshr           v28.8h, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr           v29.8h, v29.8h, #2          // shift right 2 8..15
> +
> +        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
> +        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
> +        add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
> +        add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
> +        urshr           v30.8h, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr           v31.8h, v31.8h, #2          // shift right 2 8..15
> +
> +        // Averages are now stored in these registers:
> +        // v23, v24
> +        // v26, v27
> +        // v28, v29
> +        // v30, v31
> +        // pix1 values in these registers:
> +        // v1, v16, v17, v20
> +        // available
> +        // v4, v5, v7, v16, v18, v19, v25
> +
> +        uxtl2           v4.8h, v1.16b               // 8->16 bits pix1 8..15
> +        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
> +        uxtl2           v7.8h, v16.16b              // 8->16 bits pix1 8..15
> +        uxtl            v6.8h, v16.8b               // 8->16 bits pix1 0..7
> +        uxtl2           v18.8h, v17.16b             // 8->16 bits pix1 8..15
> +        uxtl            v17.8h, v17.8b              // 8->16 bits pix1 0..7
> +        uxtl2           v25.8h, v20.16b             // 8->16 bits pix1 8..15
> +        uxtl            v20.8h, v20.8b              // 8->16 bits pix1 0..7

Instead of widening the pix1 values here, would it be possible to narrow 
the averages from pix2/pix3 by using rshrn instead of urshr above? Then 
you'd end up needing uabdl/uabal here below (resulting in the same number 
of instructions), but you could possibly get rid of all the uxtl above.

> +        uabd            v5.8h, v1.8h, v23.8h        // absolute difference 0..7
> +        uaba            v5.8h, v4.8h, v24.8h        // absolute difference accumulate 8..15
> +        uaba            v5.8h, v6.8h, v26.8h        // absolute difference accumulate 0..7
> +        uaba            v5.8h, v7.8h, v27.8h        // absolute difference accumulate 8..15
> +        uaba            v5.8h, v17.8h, v28.8h       // absolute difference accumulate 0..7
> +        uaba            v5.8h, v18.8h, v29.8h       // absolute difference accumulate 8..15
> +        uaba            v5.8h, v20.8h, v30.8h       // absolute difference accumulate 0..7
> +        uaba            v5.8h, v25.8h, v31.8h       // absolute difference accumulate 8..15
> +
> +        uaddlv          s5, v5.8h                   // add up accumulated values
> +        sub             w4, w4, #4                  // h -= 4
> +        add             d0, d0, d5                  // add to final result
> +        cmp             w4, #4                      // loop if h >= 4
> +        b.ge            1b

Please move the sub and cmp further up, so that the cmp doesn't directly 
precede the branch that depends on its result.

> +        cbnz            w4, 2f                      // if iterations remain jump to completion section
> +
> +        fmov            w0, s0                      // copy result to general purpose register
> +        ret
> +2:
> +        // v2 and v3 are set either at the end of this loop or at from the unrolled version
> +        // which branches here to complete iterations when h % 4 != 0.
> +        ldr             q5, [x5, #1]                // load pix3+1
> +        ld1             {v4.16b}, [x5], x3          // load pix3
> +        ld1             {v1.16b}, [x1], x3          // load pix1
> +        sub             w4, w4, #1                  // decrement h
> +
> +        uaddl           v18.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
> +        uaddl2          v19.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
> +        add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
> +        add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
> +        // divide by 4 to compute the average of values summed above
> +        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
> +        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15
> +
> +        uxtl2           v8.8h, v1.16b               // 8->16 bits pix1 8..15
> +        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
> +
> +        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
> +        uaba            v6.8h, v8.8h, v17.8h        // absolute difference accumulate 8..15
> +        mov             v2.16b, v18.16b             // pix3 -> pix2
> +        mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
> +        addv            h6, v6.8h                   // add up accumulator in v6
> +        add             d0, d0, d6                  // add to the final result
> +
> +        cbnz            w4, 2b                      // loop if h > 0

Since we already do sub above, I think it'd be faster and/or more 
idiomatic by making that "subs" and changing this into "b.ne 2b".

> +        fmov            w0, s0                      // copy result to general purpose register
> +        ret
> +endfunc

> diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
> new file mode 100644
> index 0000000000..c2b4421325
> --- /dev/null
> +++ b/tests/checkasm/motion.c
> @@ -0,0 +1,144 @@
> +
> +#include <string.h>
> +
> +#include "libavutil/common.h"
> +#include "libavutil/intreadwrite.h"
> +#include "libavutil/mem_internal.h"
> +
> +#include "libavcodec/me_cmp.h"
> +
> +#include "checkasm.h"
> +
> +#define WIDTH 64
> +#define HEIGHT 64
> +
> +static void fill_random(uint8_t *tab, int size)
> +{
> +    int i;
> +    for(i=0;i<size;i++) {

Spacing around parentheses and operators

> +        tab[i] = rnd() % 256;
> +    }
> +}
> +
> +static void test_motion(const char *name, me_cmp_func test_func)
> +{
> +    int x, y, d1, d2;
> +    uint8_t *ptr;
> +
> +    LOCAL_ALIGNED_8(uint8_t, img1, [WIDTH * HEIGHT]);
> +    LOCAL_ALIGNED_8(uint8_t, img2, [WIDTH * HEIGHT]);
> +
> +    declare_func_emms(AV_CPU_FLAG_MMX, int, struct MpegEncContext *c,
> +                      uint8_t *blk1 /* align width (8 or 16) */,
> +                      uint8_t *blk2 /* align 1 */, ptrdiff_t stride,
> +                      int h);
> +
> +    if (test_func == NULL) {
> +        return;
> +    }
> +
> +    /* test correctness */
> +    fill_random(img1, WIDTH * HEIGHT);
> +    fill_random(img2, WIDTH * HEIGHT);
> +
> +    if (check_func(test_func, "%s", name)) {
> +        for (y = 0; y < HEIGHT - 17; y++) {
> +            for (x = 0; x < WIDTH - 17; x++) {
> +                ptr = img2 + y * WIDTH + x;

I'm a bit unsure why we need to do 47x47 iterations of this, just to test 
various random offsets. The normal way would be to do e.g. one or a couple 
iterations with "x = rnd() % (WIDTH - 17)" instead.

> +                d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
> +                d1 = call_new(NULL, img1, ptr, WIDTH, 8);
> +
> +                if (d1 != d2) {
> +                    fail();
> +                    printf("func: %s, x=%d y=%d, error: asm=%d c=%d\n", name, x, y, d1, d2);
> +                    break;
> +                }
> +            }
> +        }
> +        // benchmark with the final value of ptr
> +        bench_new(NULL, img1, ptr, WIDTH, 8);
> +    }
> +}
> +
> +#define ME_CMP_1D_ARRAYS(XX)                                                   \
> +    XX(sad)                                                                    \
> +    XX(sse)                                                                    \
> +    XX(hadamard8_diff)                                                         \
> +    XX(vsad)                                                                   \
> +    XX(vsse)                                                                   \
> +    XX(nsse)                                                                   \
> +    XX(me_pre_cmp)                                                             \
> +    XX(me_cmp)                                                                 \
> +    XX(me_sub_cmp)                                                             \
> +    XX(mb_cmp)                                                                 \
> +    XX(ildct_cmp)                                                              \
> +    XX(frame_skip_cmp)                                                         \
> +    XX(median_sad)
> +
> +// tests for functions not yet implemented
> +#if 0
> +    XX(dct_sad)                                                                \
> +    XX(quant_psnr)                                                             \
> +    XX(bit)                                                                    \
> +    XX(rd)                                                                     \
> +    XX(w53)                                                                    \
> +    XX(w97)                                                                    \
> +    XX(dct_max)                                                                \
> +    XX(dct264_sad)                                                             \
> +
> +#endif
> +
> +static void check_motion(void)
> +{
> +    char buf[64];
> +    AVCodecContext *av_ctx;
> +    MECmpContext me_ctx;
> +
> +    memset(&me_ctx, 0, sizeof(me_ctx));
> +
> +    /* allocate AVCodecContext */
> +    av_ctx = avcodec_alloc_context3(NULL);
> +    av_ctx->flags |= AV_CODEC_FLAG_BITEXACT;
> +
> +    ff_me_cmp_init(&me_ctx, av_ctx);
> +
> +    for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.pix_abs); i++) {
> +        for (int j = 0; j < FF_ARRAY_ELEMS(me_ctx.pix_abs[0]); j++) {
> +            snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j);
> +            test_motion(buf, me_ctx.pix_abs[i][j]);
> +        }
> +    }
> +
> +#define XX(me_cmp_array)                                                        \
> +    for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.me_cmp_array); i++) {               \
> +        snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i);                     \
> +        test_motion(buf, me_ctx.me_cmp_array[i]);                               \
> +    }

The backslashes are uneven here (or there's mixed whitespace types?)

> +    ME_CMP_1D_ARRAYS(XX)
> +#undef XX
> +

It'd be good to free av_ctx here too.


Other than that, this looks good, thanks!

// Martin
Swinney, Jonathan June 26, 2022, 8:57 p.m. UTC | #2
> Instead of widening the pix1 values here, would it be possible to narrow
> the averages from pix2/pix3 by using rshrn instead of urshr above? Then
> you'd end up needing uabdl/uabal here below (resulting in the same number
> of instructions), but you could possibly get rid of all the uxtl above.

Thanks for the idea. I experimented with this and I was able to shave a bit of execution time off my previous version. I am about to submit another patch to address your comments. Thanks for taking a look!

-- 

Jonathan Swinney

On 6/21/22, 2:40 PM, "ffmpeg-devel on behalf of Martin Storsjö" <ffmpeg-devel-bounces@ffmpeg.org on behalf of martin@martin.st> wrote:

    CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.



    On Sat, 18 Jun 2022, Swinney, Jonathan wrote:

    > - ff_pix_abs16_neon
    > - ff_pix_abs16_xy2_neon
    >
    > In direct micro benchmarks of these ff functions verses their C implementations,
    > these functions performed as follows on AWS Graviton 3.
    >
    > ff_pix_abs16_neon:
    > pix_abs_0_0_c: 135.3
    > pix_abs_0_0_neon: 22.0
    >
    > ff_pix_abs16_xy2_neon:
    > pix_abs_0_3_c: 264.5
    > pix_abs_0_3_neon: 46.8
    >
    > Tested with:
    > ```
    > ./tests/checkasm/checkasm --test=motion  --bench
    > ```
    >
    > Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
    > ---
    > libavcodec/aarch64/Makefile              |   2 +
    > libavcodec/aarch64/me_cmp_init_aarch64.c |  39 +++++
    > libavcodec/aarch64/me_cmp_neon.S         | 209 +++++++++++++++++++++++
    > libavcodec/me_cmp.c                      |   4 +-
    > libavcodec/me_cmp.h                      |   1 +
    > tests/checkasm/Makefile                  |   1 +
    > tests/checkasm/checkasm.c                |   3 +
    > tests/checkasm/checkasm.h                |   1 +
    > tests/checkasm/motion.c                  | 144 ++++++++++++++++
    > tests/fate/checkasm.mak                  |   1 +
    > 10 files changed, 404 insertions(+), 1 deletion(-)
    > create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
    > create mode 100644 libavcodec/aarch64/me_cmp_neon.S
    > create mode 100644 tests/checkasm/motion.c

    Overall, this looks pretty good, thanks! There's a couple minor things
    still left to discuss.


    > diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
    > new file mode 100644
    > index 0000000000..3b48cb156d
    > --- /dev/null
    > +++ b/libavcodec/aarch64/me_cmp_neon.S
    > @@ -0,0 +1,209 @@
    > +
    > +function ff_pix_abs16_neon, export=1
    > +        // x0           unused
    > +        // x1           uint8_t *pix1
    > +        // x2           uint8_t *pix2
    > +        // x3           ptrdiff_t stride
    > +        // w4           int h
    > +        // x5           uint8_t *pix3

    This register isn't used in this function, so it's mostly confusing to
    keep it listed here.

    > +        cmp             w4, #4                      // if h < 4, jump to completion section
    > +        movi            v18.4S, #0                  // clear result accumulator
    > +        b.lt            2f
    > +1:
    > +        movi            v16.8h, #0                  // clear uabal accumulator
    > +        ld1             {v0.16b}, [x1], x3          // load pix1
    > +        ld1             {v4.16b}, [x2], x3          // load pix2
    > +        ld1             {v1.16b}, [x1], x3          // load pix1
    > +        ld1             {v5.16b}, [x2], x3          // load pix2
    > +        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    > +        uabal2          v16.8h, v0.16b, v4.16b

    I guess this first one could be uabdl, which would save the movi v16.8h,
    #0? I guess it doesn't make any practical difference though.

    > +        ld1             {v2.16b}, [x1], x3          // load pix1
    > +        ld1             {v6.16b}, [x2], x3          // load pix2
    > +        uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
    > +        uabal2          v16.8h, v1.16b, v5.16b

    With two consecutive updates to v16.8h like this, I'm afraid that the
    second one ends up blocking waiting for the result from the first one.
    Would this be faster if using two accumulator registers in parallel,
    adding them to each other only at the end?

    > +        ld1             {v3.16b}, [x1], x3
    > +        ld1             {v7.16b}, [x2], x3
    > +        uabal           v16.8h, v2.8b, v6.8b
    > +        uabal2          v16.8h, v2.16b, v6.16b
    > +        sub             w4, w4, #4                  // h -= 4
    > +        uabal           v16.8h, v3.8b, v7.8b
    > +        uabal2          v16.8h, v3.16b, v7.16b
    > +        cmp             w4, #4                      // if h >= 4, loop
    > +        uaddlv          s17, v16.8h                 // add up everything in v16 accumulator
    > +        add             d18, d17, d18               // add to the end result register
    > +
    > +        b.ge            1b
    > +        cbnz            w4, 2f                      // if iterations remain, jump to completion section
    > +
    > +        fmov            w0, s18                     // copy result to general purpose register
    > +        ret
    > +
    > +2:
    > +        movi            v16.8h, #0                  // clear the uabal accumulator
    > +        ld1             {v0.16b}, [x1], x3          // load pix1
    > +        ld1             {v4.16b}, [x2], x3          // load pix2
    > +        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate

    Same thing about using uabdl for the first case.

    > +        uabal2          v16.8h, v0.16b, v4.16b
    > +        addv            h17, v16.8h                 // add up v16
    > +        add             d18, d17, d18               // add to result
    > +        subs            w4, w4, #1                  // h -= 1
    > +        b.ne            2b

    I think this would work better on in-order cores if you'd move the subs up
    between addv and add, or after the two ld1.

    > +
    > +        fmov            w0, s18                     // copy result to general purpose register
    > +        ret
    > +endfunc
    > +
    > +function ff_pix_abs16_xy2_neon, export=1
    > +        // x0           unused
    > +        // x1           uint8_t *pix1
    > +        // x2           uint8_t *pix2
    > +        // x3           ptrdiff_t stride
    > +        // w4           int h
    > +        // x5           uint8_t *pix3

    First I wondered whether pix3 was a parameter (which it isn't), but having
    the local register allocation documented like this is good of course.
    Would it be possible to make it clearer somehow that this is a local
    variable, not a parameter?

    > +        add             x5, x2, x3                  // create a pointer for pix3
    > +        movi            v0.2d, #0                   // initialize the result register
    > +
    > +        // Load initial pix2 values for either the unrolled version or completion version.
    > +        ldr             q4, [x2, #1]                // load pix2+1

    With Microsoft armasm64, this fails - you need to use ldur instead of ldr
    here (which is what GAS and llvm assemble it to implicitly). Same thing in
    all the other cases with offset below.

    > +        ldr             q3, [x2]                    // load pix2
    > +        uaddl           v2.8h, v4.8b, v3.8b         // pix2 + pix2+1 0..7
    > +        uaddl2          v3.8h, v4.16b, v3.16b       // pix2 + pix2+1 8..15
    > +        cmp             w4, #4                      // if h < 4 jump to the completion version
    > +        b.lt            2f
    > +1:
    > +        // This is an unrolled implemntation. It completes 4 iterations of the C for each branch.
    > +        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
    > +        // plus two at the begining to start.
    > +        ldr             q5, [x5, #1]                // load pix3+1
    > +        ld1             {v4.16b}, [x5], x3          // load pix3
    > +        ld1             {v1.16b}, [x1], x3          // load pix1
    > +
    > +        ldr             q7, [x5, #1]                // load pix3+1
    > +        ld1             {v6.16b}, [x5], x3          // load pix3
    > +        ld1             {v16.16b}, [x1], x3         // load pix1
    > +
    > +        ldr             q19, [x5, #1]               // load pix3+1
    > +        ld1             {v18.16b}, [x5], x3         // load pix3
    > +        ld1             {v17.16b}, [x1], x3         // load pix1
    > +
    > +        ldr             q22, [x5, #1]               // load pix3+1
    > +        ld1             {v21.16b}, [x5], x3         // load pix3
    > +        ld1             {v20.16b}, [x1], x3         // load pix1
    > +
    > +        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
    > +        uaddl           v30.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
    > +        uaddl2          v31.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
    > +        add             v23.8h, v2.8h, v30.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
    > +        add             v24.8h, v3.8h, v31.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
    > +        urshr           v23.8h, v23.8h, #2          // shift right 2 0..7 (rounding shift right)
    > +        urshr           v24.8h, v24.8h, #2          // shift right 2 8..15
    > +
    > +        uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
    > +        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
    > +        add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
    > +        add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above
    > +        urshr           v26.8h, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
    > +        urshr           v27.8h, v27.8h, #2          // shift right 2 8..15
    > +
    > +        uaddl           v4.8h, v18.8b, v19.8b       // pix3 + pix3+1 0..7
    > +        uaddl2          v5.8h, v18.16b, v19.16b     // pix3 + pix3+1 8..15
    > +        add             v28.8h, v2.8h, v4.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
    > +        add             v29.8h, v3.8h, v5.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
    > +        urshr           v28.8h, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
    > +        urshr           v29.8h, v29.8h, #2          // shift right 2 8..15
    > +
    > +        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
    > +        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
    > +        add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
    > +        add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
    > +        urshr           v30.8h, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
    > +        urshr           v31.8h, v31.8h, #2          // shift right 2 8..15
    > +
    > +        // Averages are now stored in these registers:
    > +        // v23, v24
    > +        // v26, v27
    > +        // v28, v29
    > +        // v30, v31
    > +        // pix1 values in these registers:
    > +        // v1, v16, v17, v20
    > +        // available
    > +        // v4, v5, v7, v16, v18, v19, v25
    > +
    > +        uxtl2           v4.8h, v1.16b               // 8->16 bits pix1 8..15
    > +        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
    > +        uxtl2           v7.8h, v16.16b              // 8->16 bits pix1 8..15
    > +        uxtl            v6.8h, v16.8b               // 8->16 bits pix1 0..7
    > +        uxtl2           v18.8h, v17.16b             // 8->16 bits pix1 8..15
    > +        uxtl            v17.8h, v17.8b              // 8->16 bits pix1 0..7
    > +        uxtl2           v25.8h, v20.16b             // 8->16 bits pix1 8..15
    > +        uxtl            v20.8h, v20.8b              // 8->16 bits pix1 0..7

    Instead of widening the pix1 values here, would it be possible to narrow
    the averages from pix2/pix3 by using rshrn instead of urshr above? Then
    you'd end up needing uabdl/uabal here below (resulting in the same number
    of instructions), but you could possibly get rid of all the uxtl above.

    > +        uabd            v5.8h, v1.8h, v23.8h        // absolute difference 0..7
    > +        uaba            v5.8h, v4.8h, v24.8h        // absolute difference accumulate 8..15
    > +        uaba            v5.8h, v6.8h, v26.8h        // absolute difference accumulate 0..7
    > +        uaba            v5.8h, v7.8h, v27.8h        // absolute difference accumulate 8..15
    > +        uaba            v5.8h, v17.8h, v28.8h       // absolute difference accumulate 0..7
    > +        uaba            v5.8h, v18.8h, v29.8h       // absolute difference accumulate 8..15
    > +        uaba            v5.8h, v20.8h, v30.8h       // absolute difference accumulate 0..7
    > +        uaba            v5.8h, v25.8h, v31.8h       // absolute difference accumulate 8..15
    > +
    > +        uaddlv          s5, v5.8h                   // add up accumulated values
    > +        sub             w4, w4, #4                  // h -= 4
    > +        add             d0, d0, d5                  // add to final result
    > +        cmp             w4, #4                      // loop if h >= 4
    > +        b.ge            1b

    Please move the sub and cmp further up, so that the cmp doesn't directly
    precede the branch that depends on its result.

    > +        cbnz            w4, 2f                      // if iterations remain jump to completion section
    > +
    > +        fmov            w0, s0                      // copy result to general purpose register
    > +        ret
    > +2:
    > +        // v2 and v3 are set either at the end of this loop or from the unrolled version
    > +        // which branches here to complete iterations when h % 4 != 0.
    > +        ldr             q5, [x5, #1]                // load pix3+1
    > +        ld1             {v4.16b}, [x5], x3          // load pix3
    > +        ld1             {v1.16b}, [x1], x3          // load pix1
    > +        sub             w4, w4, #1                  // decrement h
    > +
    > +        uaddl           v18.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
    > +        uaddl2          v19.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
    > +        add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
    > +        add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
    > +        // divide by 4 to compute the average of values summed above
    > +        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
    > +        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15
    > +
    > +        uxtl2           v8.8h, v1.16b               // 8->16 bits pix1 8..15
    > +        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
    > +
    > +        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
    > +        uaba            v6.8h, v8.8h, v17.8h        // absolute difference accumulate 8..15
    > +        mov             v2.16b, v18.16b             // pix3 -> pix2
    > +        mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
    > +        addv            h6, v6.8h                   // add up accumulator in v6
    > +        add             d0, d0, d6                  // add to the final result
    > +
    > +        cbnz            w4, 2b                      // loop if h > 0

    Since we already do sub above, I think it'd be faster and/or more
    idiomatic by making that "subs" and changing this into "b.ne 2b".

    > +        fmov            w0, s0                      // copy result to general purpose register
    > +        ret
    > +endfunc

    > diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
    > new file mode 100644
    > index 0000000000..c2b4421325
    > --- /dev/null
    > +++ b/tests/checkasm/motion.c
    > @@ -0,0 +1,144 @@
    > +
    > +#include <string.h>
    > +
    > +#include "libavutil/common.h"
    > +#include "libavutil/intreadwrite.h"
    > +#include "libavutil/mem_internal.h"
    > +
    > +#include "libavcodec/me_cmp.h"
    > +
    > +#include "checkasm.h"
    > +
    > +#define WIDTH 64
    > +#define HEIGHT 64
    > +
    > +static void fill_random(uint8_t *tab, int size)
    > +{
    > +    int i;
    > +    for(i=0;i<size;i++) {

    Spacing around parentheses and operators

    > +        tab[i] = rnd() % 256;
    > +    }
    > +}
    > +
    > +static void test_motion(const char *name, me_cmp_func test_func)
    > +{
    > +    int x, y, d1, d2;
    > +    uint8_t *ptr;
    > +
    > +    LOCAL_ALIGNED_8(uint8_t, img1, [WIDTH * HEIGHT]);
    > +    LOCAL_ALIGNED_8(uint8_t, img2, [WIDTH * HEIGHT]);
    > +
    > +    declare_func_emms(AV_CPU_FLAG_MMX, int, struct MpegEncContext *c,
    > +                      uint8_t *blk1 /* align width (8 or 16) */,
    > +                      uint8_t *blk2 /* align 1 */, ptrdiff_t stride,
    > +                      int h);
    > +
    > +    if (test_func == NULL) {
    > +        return;
    > +    }
    > +
    > +    /* test correctness */
    > +    fill_random(img1, WIDTH * HEIGHT);
    > +    fill_random(img2, WIDTH * HEIGHT);
    > +
    > +    if (check_func(test_func, "%s", name)) {
    > +        for (y = 0; y < HEIGHT - 17; y++) {
    > +            for (x = 0; x < WIDTH - 17; x++) {
    > +                ptr = img2 + y * WIDTH + x;

    I'm a bit unsure why we need to do 47x47 iterations of this, just to test
    various random offsets. The normal way would be to do e.g. one or a couple
    iterations with "x = rnd() % (WIDTH - 17)" instead.

    > +                d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
    > +                d1 = call_new(NULL, img1, ptr, WIDTH, 8);
    > +
    > +                if (d1 != d2) {
    > +                    fail();
    > +                    printf("func: %s, x=%d y=%d, error: asm=%d c=%d\n", name, x, y, d1, d2);
    > +                    break;
    > +                }
    > +            }
    > +        }
    > +        // benchmark with the final value of ptr
    > +        bench_new(NULL, img1, ptr, WIDTH, 8);
    > +    }
    > +}
    > +
    > +#define ME_CMP_1D_ARRAYS(XX)                                                   \
    > +    XX(sad)                                                                    \
    > +    XX(sse)                                                                    \
    > +    XX(hadamard8_diff)                                                         \
    > +    XX(vsad)                                                                   \
    > +    XX(vsse)                                                                   \
    > +    XX(nsse)                                                                   \
    > +    XX(me_pre_cmp)                                                             \
    > +    XX(me_cmp)                                                                 \
    > +    XX(me_sub_cmp)                                                             \
    > +    XX(mb_cmp)                                                                 \
    > +    XX(ildct_cmp)                                                              \
    > +    XX(frame_skip_cmp)                                                         \
    > +    XX(median_sad)
    > +
    > +// tests for functions not yet implemented
    > +#if 0
    > +    XX(dct_sad)                                                                \
    > +    XX(quant_psnr)                                                             \
    > +    XX(bit)                                                                    \
    > +    XX(rd)                                                                     \
    > +    XX(w53)                                                                    \
    > +    XX(w97)                                                                    \
    > +    XX(dct_max)                                                                \
    > +    XX(dct264_sad)                                                             \
    > +
    > +#endif
    > +
    > +static void check_motion(void)
    > +{
    > +    char buf[64];
    > +    AVCodecContext *av_ctx;
    > +    MECmpContext me_ctx;
    > +
    > +    memset(&me_ctx, 0, sizeof(me_ctx));
    > +
    > +    /* allocate AVCodecContext */
    > +    av_ctx = avcodec_alloc_context3(NULL);
    > +    av_ctx->flags |= AV_CODEC_FLAG_BITEXACT;
    > +
    > +    ff_me_cmp_init(&me_ctx, av_ctx);
    > +
    > +    for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.pix_abs); i++) {
    > +        for (int j = 0; j < FF_ARRAY_ELEMS(me_ctx.pix_abs[0]); j++) {
    > +            snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j);
    > +            test_motion(buf, me_ctx.pix_abs[i][j]);
    > +        }
    > +    }
    > +
    > +#define XX(me_cmp_array)                                                        \
    > +    for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.me_cmp_array); i++) {               \
    > +        snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i);                     \
    > +        test_motion(buf, me_ctx.me_cmp_array[i]);                               \
    > +    }

    The backslashes are uneven here (or there's mixed whitespace types?)

    > +    ME_CMP_1D_ARRAYS(XX)
    > +#undef XX
    > +

    It'd be good to free av_ctx here too.


    Other than that, this looks good, thanks!

    // Martin

    _______________________________________________
    ffmpeg-devel mailing list
    ffmpeg-devel@ffmpeg.org
    https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

    To unsubscribe, visit link above, or email
    ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox series

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index c8935f205e..9ce21566c6 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@  OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
 OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
@@ -47,6 +48,7 @@  NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_neon.o              \
                                            aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
 NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
new file mode 100644
index 0000000000..9fb63e9973
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -0,0 +1,39 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                      ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                      ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->pix_abs[0][0] = ff_pix_abs16_neon;
+        c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+    }
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
new file mode 100644
index 0000000000..3b48cb156d
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -0,0 +1,209 @@ 
+/*
+ * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_pix_abs16_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+        // x5           uint8_t *pix3
+        cmp             w4, #4                      // if h < 4, jump to completion section
+        movi            v18.4S, #0                  // clear result accumulator
+        b.lt            2f
+1:
+        movi            v16.8h, #0                  // clear uabal accumulator
+        ld1             {v0.16b}, [x1], x3          // load pix1
+        ld1             {v4.16b}, [x2], x3          // load pix2
+        ld1             {v1.16b}, [x1], x3          // load pix1
+        ld1             {v5.16b}, [x2], x3          // load pix2
+        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
+        uabal2          v16.8h, v0.16b, v4.16b
+        ld1             {v2.16b}, [x1], x3          // load pix1
+        ld1             {v6.16b}, [x2], x3          // load pix2
+        uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
+        uabal2          v16.8h, v1.16b, v5.16b
+        ld1             {v3.16b}, [x1], x3
+        ld1             {v7.16b}, [x2], x3
+        uabal           v16.8h, v2.8b, v6.8b
+        uabal2          v16.8h, v2.16b, v6.16b
+        sub             w4, w4, #4                  // h -= 4
+        uabal           v16.8h, v3.8b, v7.8b
+        uabal2          v16.8h, v3.16b, v7.16b
+        cmp             w4, #4                      // if h >= 4, loop
+        uaddlv          s17, v16.8h                 // add up everything in v16 accumulator
+        add             d18, d17, d18               // add to the end result register
+
+        b.ge            1b
+        cbnz            w4, 2f                      // if iterations remain, jump to completion section
+
+        fmov            w0, s18                     // copy result to general purpose register
+        ret
+
+2:
+        movi            v16.8h, #0                  // clear the uabal accumulator
+        ld1             {v0.16b}, [x1], x3          // load pix1
+        ld1             {v4.16b}, [x2], x3          // load pix2
+        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
+        uabal2          v16.8h, v0.16b, v4.16b
+        addv            h17, v16.8h                 // add up v16
+        add             d18, d17, d18               // add to result
+        subs            w4, w4, #1                  // h -= 1
+        b.ne            2b
+
+        fmov            w0, s18                     // copy result to general purpose register
+        ret
+endfunc
+
+function ff_pix_abs16_xy2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+        // x5           uint8_t *pix3
+        add             x5, x2, x3                  // create a pointer for pix3
+        movi            v0.2d, #0                   // initialize the result register
+
+        // Load initial pix2 values for either the unrolled version or completion version.
+        ldr             q4, [x2, #1]                // load pix2+1
+        ldr             q3, [x2]                    // load pix2
+        uaddl           v2.8h, v4.8b, v3.8b         // pix2 + pix2+1 0..7
+        uaddl2          v3.8h, v4.16b, v3.16b       // pix2 + pix2+1 8..15
+        cmp             w4, #4                      // if h < 4 jump to the completion version
+        b.lt            2f
+1:
+        // This is an unrolled implementation. It completes 4 iterations of the C loop for each branch.
+        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
+        // plus two at the beginning to start.
+        ldr             q5, [x5, #1]                // load pix3+1
+        ld1             {v4.16b}, [x5], x3          // load pix3
+        ld1             {v1.16b}, [x1], x3          // load pix1
+
+        ldr             q7, [x5, #1]                // load pix3+1
+        ld1             {v6.16b}, [x5], x3          // load pix3
+        ld1             {v16.16b}, [x1], x3         // load pix1
+
+        ldr             q19, [x5, #1]               // load pix3+1
+        ld1             {v18.16b}, [x5], x3         // load pix3
+        ld1             {v17.16b}, [x1], x3         // load pix1
+
+        ldr             q22, [x5, #1]               // load pix3+1
+        ld1             {v21.16b}, [x5], x3         // load pix3
+        ld1             {v20.16b}, [x1], x3         // load pix1
+
+        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
+        uaddl           v30.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
+        uaddl2          v31.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
+        add             v23.8h, v2.8h, v30.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
+        add             v24.8h, v3.8h, v31.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
+        urshr           v23.8h, v23.8h, #2          // shift right 2 0..7 (rounding shift right)
+        urshr           v24.8h, v24.8h, #2          // shift right 2 8..15
+
+        uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
+        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
+        add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        urshr           v26.8h, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
+        urshr           v27.8h, v27.8h, #2          // shift right 2 8..15
+
+        uaddl           v4.8h, v18.8b, v19.8b       // pix3 + pix3+1 0..7
+        uaddl2          v5.8h, v18.16b, v19.16b     // pix3 + pix3+1 8..15
+        add             v28.8h, v2.8h, v4.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add             v29.8h, v3.8h, v5.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        urshr           v28.8h, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
+        urshr           v29.8h, v29.8h, #2          // shift right 2 8..15
+
+        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
+        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
+        add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        urshr           v30.8h, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
+        urshr           v31.8h, v31.8h, #2          // shift right 2 8..15
+
+        // Averages are now stored in these registers:
+        // v23, v24
+        // v26, v27
+        // v28, v29
+        // v30, v31
+        // pix1 values in these registers:
+        // v1, v16, v17, v20
+        // available
+        // v4, v5, v7, v16, v18, v19, v25
+
+        uxtl2           v4.8h, v1.16b               // 8->16 bits pix1 8..15
+        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
+        uxtl2           v7.8h, v16.16b              // 8->16 bits pix1 8..15
+        uxtl            v6.8h, v16.8b               // 8->16 bits pix1 0..7
+        uxtl2           v18.8h, v17.16b             // 8->16 bits pix1 8..15
+        uxtl            v17.8h, v17.8b              // 8->16 bits pix1 0..7
+        uxtl2           v25.8h, v20.16b             // 8->16 bits pix1 8..15
+        uxtl            v20.8h, v20.8b              // 8->16 bits pix1 0..7
+
+        uabd            v5.8h, v1.8h, v23.8h        // absolute difference 0..7
+        uaba            v5.8h, v4.8h, v24.8h        // absolute difference accumulate 8..15
+        uaba            v5.8h, v6.8h, v26.8h        // absolute difference accumulate 0..7
+        uaba            v5.8h, v7.8h, v27.8h        // absolute difference accumulate 8..15
+        uaba            v5.8h, v17.8h, v28.8h       // absolute difference accumulate 0..7
+        uaba            v5.8h, v18.8h, v29.8h       // absolute difference accumulate 8..15
+        uaba            v5.8h, v20.8h, v30.8h       // absolute difference accumulate 0..7
+        uaba            v5.8h, v25.8h, v31.8h       // absolute difference accumulate 8..15
+
+        uaddlv          s5, v5.8h                   // add up accumulated values
+        sub             w4, w4, #4                  // h -= 4
+        add             d0, d0, d5                  // add to final result
+        cmp             w4, #4                      // loop if h >= 4
+        b.ge            1b
+        cbnz            w4, 2f                      // if iterations remain jump to completion section
+
+        fmov            w0, s0                      // copy result to general purpose register
+        ret
+2:
+        // v2 and v3 are set either at the end of this loop or from the unrolled version
+        // which branches here to complete iterations when h % 4 != 0.
+        ldr             q5, [x5, #1]                // load pix3+1
+        ld1             {v4.16b}, [x5], x3          // load pix3
+        ld1             {v1.16b}, [x1], x3          // load pix1
+        sub             w4, w4, #1                  // decrement h
+
+        uaddl           v18.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
+        uaddl2          v19.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
+        add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
+        add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
+        // divide by 4 to compute the average of values summed above
+        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
+        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15
+
+        uxtl2           v8.8h, v1.16b               // 8->16 bits pix1 8..15
+        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
+
+        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
+        uaba            v6.8h, v8.8h, v17.8h        // absolute difference accumulate 8..15
+        mov             v2.16b, v18.16b             // pix3 -> pix2
+        mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
+        addv            h6, v6.8h                   // add up accumulator in v6
+        add             d0, d0, d6                  // add to the final result
+
+        cbnz            w4, 2b                      // loop if h > 0
+        fmov            w0, s0                      // copy result to general purpose register
+        ret
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index 4407d0a7e9..d0cd14ad33 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1061,7 +1061,9 @@  av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
     ff_dsputil_init_dwt(c);
 #endif
 
-#if ARCH_ALPHA
+#if ARCH_AARCH64
+    ff_me_cmp_init_aarch64(c, avctx);
+#elif ARCH_ALPHA
     ff_me_cmp_init_alpha(c, avctx);
 #elif ARCH_ARM
     ff_me_cmp_init_arm(c, avctx);
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index e9b5161c9a..7b057a923b 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -80,6 +80,7 @@  typedef struct MECmpContext {
 } MECmpContext;
 
 void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index f6b1008855..e869c70b55 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -12,6 +12,7 @@  AVCODECOBJS-$(CONFIG_H264QPEL)          += h264qpel.o
 AVCODECOBJS-$(CONFIG_IDCTDSP)           += idctdsp.o
 AVCODECOBJS-$(CONFIG_LLVIDDSP)          += llviddsp.o
 AVCODECOBJS-$(CONFIG_LLVIDENCDSP)       += llviddspenc.o
+AVCODECOBJS-$(CONFIG_ME_CMP)            += motion.o
 AVCODECOBJS-$(CONFIG_VC1DSP)            += vc1dsp.o
 AVCODECOBJS-$(CONFIG_VP8DSP)            += vp8dsp.o
 AVCODECOBJS-$(CONFIG_VIDEODSP)          += videodsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 57134f96ea..5ffcafbda9 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -135,6 +135,9 @@  static const struct {
     #if CONFIG_LLVIDENCDSP
         { "llviddspenc", checkasm_check_llviddspenc },
     #endif
+    #if CONFIG_ME_CMP
+        { "motion", checkasm_check_motion },
+    #endif
     #if CONFIG_OPUS_DECODER
         { "opusdsp", checkasm_check_opusdsp },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index a86db140e3..b601a98754 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -68,6 +68,7 @@  void checkasm_check_idctdsp(void);
 void checkasm_check_jpeg2000dsp(void);
 void checkasm_check_llviddsp(void);
 void checkasm_check_llviddspenc(void);
+void checkasm_check_motion(void);
 void checkasm_check_nlmeans(void);
 void checkasm_check_opusdsp(void);
 void checkasm_check_pixblockdsp(void);
diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
new file mode 100644
index 0000000000..c2b4421325
--- /dev/null
+++ b/tests/checkasm/motion.c
@@ -0,0 +1,144 @@ 
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/me_cmp.h"
+
+#include "checkasm.h"
+
+#define WIDTH 64
+#define HEIGHT 64
+
+/* Fill a buffer with pseudo-random byte values from the checkasm RNG. */
+static void fill_random(uint8_t *tab, int size)
+{
+    for (int i = 0; i < size; i++)
+        tab[i] = rnd() & 0xFF;
+}
+
+/* Check one me_cmp_func against the C reference, then benchmark it.
+ *
+ * Both implementations are called for every (x, y) offset of the second
+ * block that keeps a 16x16 read — plus the extra row/column the *_xy2
+ * variants access — inside the WIDTH x HEIGHT buffer.  On the first
+ * mismatch the whole scan stops (the previous code only broke out of
+ * the inner loop and kept reporting failures for every remaining row).
+ */
+static void test_motion(const char *name, me_cmp_func test_func)
+{
+    int x, y, d1, d2;
+    uint8_t *ptr = NULL;
+
+    /* blk1 is declared below as "align width (8 or 16)"; use 16-byte
+     * alignment so the 16-wide functions get what the contract promises. */
+    LOCAL_ALIGNED_16(uint8_t, img1, [WIDTH * HEIGHT]);
+    LOCAL_ALIGNED_16(uint8_t, img2, [WIDTH * HEIGHT]);
+
+    declare_func_emms(AV_CPU_FLAG_MMX, int, struct MpegEncContext *c,
+                      uint8_t *blk1 /* align width (8 or 16) */,
+                      uint8_t *blk2 /* align 1 */, ptrdiff_t stride,
+                      int h);
+
+    /* not every slot of the MECmpContext arrays is populated */
+    if (test_func == NULL) {
+        return;
+    }
+
+    /* test correctness */
+    fill_random(img1, WIDTH * HEIGHT);
+    fill_random(img2, WIDTH * HEIGHT);
+
+    if (check_func(test_func, "%s", name)) {
+        for (y = 0; y < HEIGHT - 17; y++) {
+            for (x = 0; x < WIDTH - 17; x++) {
+                ptr = img2 + y * WIDTH + x;
+                d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
+                d1 = call_new(NULL, img1, ptr, WIDTH, 8);
+
+                if (d1 != d2) {
+                    fail();
+                    printf("func: %s, x=%d y=%d, error: asm=%d c=%d\n", name, x, y, d1, d2);
+                    /* stop after the first mismatch instead of spamming
+                     * a failure report per remaining row */
+                    goto bench;
+                }
+            }
+        }
+bench:
+        // benchmark with the final value of ptr
+        bench_new(NULL, img1, ptr, WIDTH, 8);
+    }
+}
+
+/* X-macro: the members of MECmpContext that are one-dimensional arrays
+ * of me_cmp_func.  check_motion() expands this list to generate one
+ * test per element of each array.  (Inline comments are impossible
+ * inside the macro itself because of the line continuations.) */
+#define ME_CMP_1D_ARRAYS(XX)                                                   \
+    XX(sad)                                                                    \
+    XX(sse)                                                                    \
+    XX(hadamard8_diff)                                                         \
+    XX(vsad)                                                                   \
+    XX(vsse)                                                                   \
+    XX(nsse)                                                                   \
+    XX(me_pre_cmp)                                                             \
+    XX(me_cmp)                                                                 \
+    XX(me_sub_cmp)                                                             \
+    XX(mb_cmp)                                                                 \
+    XX(ildct_cmp)                                                              \
+    XX(frame_skip_cmp)                                                         \
+    XX(median_sad)
+
+// entries for comparison functions that have no asm implementation yet;
+// move them into ME_CMP_1D_ARRAYS above once one exists
+#if 0
+    XX(dct_sad)                                                                \
+    XX(quant_psnr)                                                             \
+    XX(bit)                                                                    \
+    XX(rd)                                                                     \
+    XX(w53)                                                                    \
+    XX(w97)                                                                    \
+    XX(dct_max)                                                                \
+    XX(dct264_sad)                                                             \
+
+#endif
+
+/* Run test_motion() over every me_cmp function pointer that
+ * ff_me_cmp_init() fills in for this build and CPU. */
+static void check_motion(void)
+{
+    char buf[64];
+    AVCodecContext *av_ctx;
+    MECmpContext me_ctx;
+
+    memset(&me_ctx, 0, sizeof(me_ctx));
+
+    /* allocate AVCodecContext; silently skip the test on OOM */
+    av_ctx = avcodec_alloc_context3(NULL);
+    if (!av_ctx)
+        return;
+    av_ctx->flags |= AV_CODEC_FLAG_BITEXACT;
+
+    ff_me_cmp_init(&me_ctx, av_ctx);
+
+    for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.pix_abs); i++) {
+        for (int j = 0; j < FF_ARRAY_ELEMS(me_ctx.pix_abs[0]); j++) {
+            snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j);
+            test_motion(buf, me_ctx.pix_abs[i][j]);
+        }
+    }
+
+    /* expand one test loop per 1-D me_cmp_func array in MECmpContext */
+#define XX(me_cmp_array)                                                       \
+    for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.me_cmp_array); i++) {            \
+        snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i);                    \
+        test_motion(buf, me_ctx.me_cmp_array[i]);                              \
+    }
+    ME_CMP_1D_ARRAYS(XX)
+#undef XX
+
+    /* previously leaked: the context was never freed */
+    avcodec_free_context(&av_ctx);
+}
+
+/* checkasm entry point for the "motion" (me_cmp) test group; runs all
+ * checks and emits the summary line via report(). */
+void checkasm_check_motion(void)
+{
+    check_motion();
+    report("motion");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index c6273db183..4d2f321e84 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -23,6 +23,7 @@  FATE_CHECKASM = fate-checkasm-aacpsdsp                                  \
                 fate-checkasm-jpeg2000dsp                               \
                 fate-checkasm-llviddsp                                  \
                 fate-checkasm-llviddspenc                               \
+                fate-checkasm-motion                                    \
                 fate-checkasm-opusdsp                                   \
                 fate-checkasm-pixblockdsp                               \
                 fate-checkasm-sbrdsp                                    \