diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_init_aarch64.o
@@ -46,6 +47,7 @@ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
new file mode 100644
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+ ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ c->pix_abs[0][0] = ff_pix_abs16_neon;
+ c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+ }
+}
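
For reference, the scalar C fallback that this dispatch replaces computes a plain
sum of absolute differences over a 16-pixel-wide block. A minimal sketch of that
behavior (illustrative only; the name pix_abs16_scalar and this exact shape are
not the FFmpeg source, whose C reference lives in libavcodec/me_cmp.c):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of c->pix_abs[0][0]: sum of absolute differences
     * between two 16-pixel-wide blocks, h rows tall. */
    static int pix_abs16_scalar(const uint8_t *pix1, const uint8_t *pix2,
                                ptrdiff_t stride, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 16; x++)
                sum += abs(pix1[x] - pix2[x]);
            pix1 += stride;
            pix2 += stride;
        }
        return sum;
    }

This is the loop the NEON code below vectorizes: the sixteen byte differences
per row collapse into a single uabal/uabal2 pair.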
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
new file mode 100644
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+function ff_pix_abs16_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // x5 unused here (pix3 pointer in the xy2 variant)
+ movi v18.4S, #0 // clear result accumulator (before any branch, so 2: sees it zeroed)
+ cmp w4, #4 // if h < 4, jump to completion section
+ b.lt 2f
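+ // Strategy: accumulate absolute differences for four rows at a time into the
+ // 16-bit lanes of v16 (uabal/uabal2 fold bytes 0..7 and 8..15 into the same
+ // eight lanes), then reduce with addv and add the result into v18. Four rows
+ // are safe from overflow: each 16-bit lane sums at most 8 * 255 = 2040.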
+1:
+ movi v16.8H, #0 // clear uabal accumulator
+ ld1 {v0.16B}, [x1], x3 // load pix1
+ ld1 {v4.16B}, [x2], x3 // load pix2
+ ld1 {v1.16B}, [x1], x3 // load pix1
+ ld1 {v5.16B}, [x2], x3 // load pix2
+ uabal v16.8H, v0.8B, v4.8B // absolute difference accumulate
+ uabal2 v16.8H, v0.16B, v4.16B
+ ld1 {v2.16B}, [x1], x3 // load pix1
+ ld1 {v6.16B}, [x2], x3 // load pix2
+ uabal v16.8H, v1.8B, v5.8B // absolute difference accumulate
+ uabal2 v16.8H, v1.16B, v5.16B
+ ld1 {v3.16B}, [x1], x3
+ ld1 {v7.16B}, [x2], x3
+ uabal v16.8H, v2.8B, v6.8B
+ uabal2 v16.8H, v2.16B, v6.16B
+ sub w4, w4, #4 // h -= 4
+ uabal v16.8H, v3.8B, v7.8B
+ uabal2 v16.8H, v3.16B, v7.16B
+ cmp w4, #4 // if h >= 4, loop
+ addv h17, v16.8H // add up everything in v16 accumulator
+ add d18, d17, d18 // add to the end result register
+
+ b.ge 1b
+ cbnz w4, 2f // if iterations remain, jump to completion section
+
+ mov w0, v18.S[0] // copy result to general purpose register
+ ret
+
+2:
+ movi v16.8H, #0 // clear the uabal accumulator
+ ld1 {v0.16B}, [x1] // load pix1
+ ld1 {v4.16B}, [x2] // load pix2
+ add x1, x1, x3 // increment pointers
+ add x2, x2, x3
+ uabal v16.8H, v0.8B, v4.8B // absolute difference accumulate
+ uabal2 v16.8H, v0.16B, v4.16B
+ addv h17, v16.8H // add up v16
+ add d18, d17, d18 // add to result
+ subs w4, w4, #1 // h -= 1
+ b.ne 2b
+
+ mov w0, v18.S[0] // copy result to general purpose register
+ ret
+endfunc
+
+function ff_pix_abs16_xy2_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // x5 uint8_t *pix3
+ add x5, x2, x3 // create a pointer for pix3
+ movi v0.2D, #0 // initialize the result register
+
+ // I also tested these instructions to get pix2+1 from pix2, but it wasn't faster
+ // than just doing another full (unaligned) load.
+ // ldr b21, [x5, #16]
+ // ushr v4.2D, v2.2D, #8
+ // mov v4.B[15], v21.B[0]
+ // mov v4.B[7], v2.B[8]
+
+ // Load initial pix2 values for either the unrolled version or the completion version.
+ ldr q4, [x2, #1] // load pix2+1
+ ldr q2, [x2] // load pix2
+ cmp w4, #4 // if h < 4 jump to the completion version
+ b.lt 2f
+1:
+ // This is an unrolled implementation. It completes 4 iterations of the C loop per branch.
+ // In each iteration, pix2[i+1] == pix3[i], i.e. each pix3 row becomes the next row's pix2,
+ // so we need only three loads per iteration, plus two at the beginning to prime the loop.
+ ldr q5, [x5, #1] // load pix3+1
+ ld1 {v3.16B}, [x5], x3 // load pix3
+ ld1 {v1.16B}, [x1], x3 // load pix1
+
+ ldr q16, [x5, #1] // load pix3+1
+ ld1 {v7.16B}, [x5], x3 // load pix3
+ ld1 {v6.16B}, [x1], x3 // load pix1
+
+ ldr q19, [x5, #1] // load pix3+1
+ ld1 {v18.16B}, [x5], x3 // load pix3
+ ld1 {v17.16B}, [x1], x3 // load pix1
+
+ ldr q22, [x5, #1] // load pix3+1
+ ld1 {v21.16B}, [x5], x3 // load pix3
+ ld1 {v20.16B}, [x1], x3 // load pix1
+
+ // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
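+ // Each lane models the scalar expression (pix2[n] + pix2[n+1] + pix3[n] +
+ // pix3[n+1] + 2) >> 2: uaddl/uaddl2 widen and sum the pairs to 16 bits and
+ // urshr #2 applies the rounding divide by 4.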
+ uaddl v23.8H, v2.8B, v4.8B // pix2 + pix2+1 0..7
+ uaddl2 v24.8H, v2.16B, v4.16B // pix2 + pix2+1 8..15
+ uaddl v30.8H, v3.8B, v5.8B // pix3 + pix3+1 0..7
+ uaddl2 v31.8H, v3.16B, v5.16B // pix3 + pix3+1 8..15
+ add v23.8H, v23.8H, v30.8H // add up 0..7
+ add v24.8H, v24.8H, v31.8H // add up 8..15
+ urshr v23.8H, v23.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v24.8H, v24.8H, #2 // shift right 2 8..15
+
+ uaddl v26.8H, v3.8B, v5.8B // pix2 + pix2+1 0..7
+ uaddl2 v27.8H, v3.16B, v5.16B // pix2 + pix2+1 8..15
+ uaddl v2.8H, v7.8B, v16.8B // pix3 + pix3+1 0..7
+ uaddl2 v4.8H, v7.16B, v16.16B // pix3 + pix3+1 8..15
+ add v26.8H, v26.8H, v2.8H // add up 0..7
+ add v27.8H, v27.8H, v4.8H // add up 8..15
+ urshr v26.8H, v26.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v27.8H, v27.8H, #2 // shift right 2 8..15
+
+ uaddl v28.8H, v7.8B, v16.8B // pix2 + pix2+1 0..7
+ uaddl2 v29.8H, v7.16B, v16.16B // pix2 + pix2+1 8..15
+ uaddl v3.8H, v18.8B, v19.8B // pix3 + pix3+1 0..7
+ uaddl2 v5.8H, v18.16B, v19.16B // pix3 + pix3+1 8..15
+ add v28.8H, v28.8H, v3.8H // add up 0..7
+ add v29.8H, v29.8H, v5.8H // add up 8..15
+ urshr v28.8H, v28.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v29.8H, v29.8H, #2 // shift right 2 8..15
+
+ uaddl v30.8H, v18.8B, v19.8B // pix2 + pix2+1 0..7
+ uaddl2 v31.8H, v18.16B, v19.16B // pix2 + pix2+1 8..15
+ uaddl v2.8H, v21.8B, v22.8B // pix3 + pix3+1 0..7
+ uaddl2 v4.8H, v21.16B, v22.16B // pix3 + pix3+1 8..15
+ add v30.8H, v30.8H, v2.8H // add up 0..7
+ add v31.8H, v31.8H, v4.8H // add up 8..15
+ urshr v30.8H, v30.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v31.8H, v31.8H, #2 // shift right 2 8..15
+
+ // Averages are now stored in these registers:
+ // v23, v24
+ // v26, v27
+ // v28, v29
+ // v30, v31
+ // pix1 values in these registers:
+ // v1, v6, v17, v20
+ // available
+ // v2, v3, v4, v5, v7, v16, v18, v19, v25
+
+ uxtl2 v2.8H, v1.16B // 8->16 bits pix1 8..15
+ uxtl v1.8H, v1.8B // 8->16 bits pix1 0..7
+ uxtl2 v7.8H, v6.16B // 8->16 bits pix1 8..15
+ uxtl v6.8H, v6.8B // 8->16 bits pix1 0..7
+ uxtl2 v18.8H, v17.16B // 8->16 bits pix1 8..15
+ uxtl v17.8H, v17.8B // 8->16 bits pix1 0..7
+ uxtl2 v25.8H, v20.16B // 8->16 bits pix1 8..15
+ uxtl v20.8H, v20.8B // 8->16 bits pix1 0..7
+
+ uabd v5.8H, v1.8H, v23.8H // absolute difference 0..7
+ uaba v5.8H, v2.8H, v24.8H // absolute difference accumulate 8..15
+ uaba v5.8H, v6.8H, v26.8H // absolute difference accumulate 0..7
+ uaba v5.8H, v7.8H, v27.8H // absolute difference accumulate 8..15
+ uaba v5.8H, v17.8H, v28.8H // absolute difference accumulate 0..7
+ uaba v5.8H, v18.8H, v29.8H // absolute difference accumulate 8..15
+ uaba v5.8H, v20.8H, v30.8H // absolute difference accumulate 0..7
+ uaba v5.8H, v25.8H, v31.8H // absolute difference accumulate 8..15
+
+ uaddlv s5, v5.8H // add up accumulated values
+ add d0, d0, d5 // add to final result
+
+ mov v2.16B, v21.16B // pix3 -> pix2
+ mov v4.16B, v22.16B // pix3+1 -> pix2+1
+
+ sub w4, w4, #4 // h -= 4
+ cmp w4, #4 // loop if h >= 4
+ b.ge 1b
+ cbnz w4, 2f // if iterations remain jump to completion section
+
+ mov w0, v0.s[0] // copy result to general purpose register
+ ret
+2:
+ // q2 and q4 are set either at the end of this loop or by the unrolled version, which
+ // branches here to complete the remaining iterations when h % 4 != 0.
+ ldr q5, [x5, #1] // load pix3+1
+ ld1 {v3.16B}, [x5], x3 // load pix3
+ ld1 {v1.16B}, [x1], x3 // load pix1
+ subs w4, w4, #1 // decrement h and set flags for branch below
+
+ uaddl v16.8H, v2.8B, v4.8B // pix2 + pix2+1 0..7
+ uaddl2 v17.8H, v2.16B, v4.16B // pix2 + pix2+1 8..15
+ uaddl v18.8H, v3.8B, v5.8B // pix3 + pix3+1 0..7
+ uaddl2 v19.8H, v3.16B, v5.16B // pix3 + pix3+1 8..15
+ add v16.8H, v16.8H, v18.8H // add up 0..7
+ add v17.8H, v17.8H, v19.8H // add up 8..15
+ // divide by 4 to compute the average of values summed above
+ urshr v16.8H, v16.8H, #2 // shift right by 2 0..7 (rounding shift right)
+ urshr v17.8H, v17.8H, #2 // shift right by 2 8..15
+
+ uxtl2 v7.8H, v1.16B // 8->16 bits pix1 8..15 (v7 is free here; v8-v15 are callee-saved)
+ uxtl v1.8H, v1.8B // 8->16 bits pix1 0..7
+
+ uabd v6.8H, v1.8H, v16.8H // absolute difference 0..7
+ uaba v6.8H, v7.8H, v17.8H // absolute difference accumulate 8..15
+ addv h6, v6.8H // add up accumulator in v6
+ add d0, d0, d6
+
+ mov v2.16B, v3.16B // pix3 -> pix2
+ mov v4.16B, v5.16B // pix3+1 -> pix2+1
+
+ b.ne 2b // branch based on subs instruction above
+ mov w0, v0.s[0] // copy result to general purpose register
+ ret
+endfunc
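
For the xy2 case, pix2 is sampled at a half-pixel offset in both x and y, so each
compared value is the rounded average of a 2x2 neighborhood. A rough scalar
equivalent of what the function above computes (hypothetical sketch; the actual C
reference is pix_abs16_xy2_c in libavcodec/me_cmp.c):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    static int pix_abs16_xy2_scalar(const uint8_t *pix1, const uint8_t *pix2,
                                    ptrdiff_t stride, int h)
    {
        const uint8_t *pix3 = pix2 + stride; /* the row below pix2 */
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 16; x++) {
                /* rounding average of the 2x2 neighborhood; urshr #2 in the
                 * assembly implements exactly this (x + 2) >> 2 rounding */
                int avg = (pix2[x] + pix2[x + 1] + pix3[x] + pix3[x + 1] + 2) >> 2;
                sum += abs(pix1[x] - avg);
            }
            pix1 += stride;
            pix2 += stride;
            pix3 += stride;
        }
        return sum;
    }

The unrolled NEON loop exploits the overlap between rows: the pix3 row of one
output row is the pix2 row of the next, so only three vector loads per row are
needed instead of four.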
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
@@ -1064,6 +1064,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
ff_me_cmp_init_alpha(c, avctx);
if (ARCH_ARM)
ff_me_cmp_init_arm(c, avctx);
+ if (ARCH_AARCH64)
+ ff_me_cmp_init_aarch64(c, avctx);
if (ARCH_PPC)
ff_me_cmp_init_ppc(c, avctx);
if (ARCH_X86)
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
@@ -82,6 +82,7 @@ typedef struct MECmpContext {
void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
- ff_pix_abs16_neon
- ff_pix_abs16_xy2_neon

In direct micro benchmarks of these ff functions versus their C implementations,
these functions performed as follows on AWS Graviton 2:

ff_pix_abs16_neon:
  c:  benchmark ran 100000 iterations in 0.955383 seconds
  ff: benchmark ran 100000 iterations in 0.097669 seconds

ff_pix_abs16_xy2_neon:
  c:  benchmark ran 100000 iterations in 1.916759 seconds
  ff: benchmark ran 100000 iterations in 0.414291 seconds

Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
---
 libavcodec/aarch64/Makefile              |   2 +
 libavcodec/aarch64/me_cmp_init_aarch64.c |  39 ++++
 libavcodec/aarch64/me_cmp_neon.S         | 230 +++++++++++++++++++++++
 libavcodec/me_cmp.c                      |   2 +
 libavcodec/me_cmp.h                      |   1 +
 5 files changed, 274 insertions(+)
 create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/me_cmp_neon.S
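
The timing numbers above came from a simple iteration loop. A sketch of such a
harness (hypothetical: the bench helper, its labels, and the simplified context
argument are illustrative, not the actual test program used):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    #define ITERATIONS 100000

    /* Shape of the me_cmp function being timed. The MpegEncContext argument
     * is unused by these two functions, so a plain void * stands in here. */
    typedef int (*sad_fn)(void *s, uint8_t *blk1, uint8_t *blk2,
                          ptrdiff_t stride, int h);

    static void bench(const char *label, sad_fn f, uint8_t *b1, uint8_t *b2,
                      ptrdiff_t stride, int h)
    {
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (int i = 0; i < ITERATIONS; i++)
            f(NULL, b1, b2, stride, h);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        printf("%s: benchmark ran %d iterations in %f seconds\n", label,
               ITERATIONS,
               (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
    }

Calling bench once with the C function pointer and once with the NEON one
reproduces the style of output quoted above.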