[FFmpeg-devel] lavc/aarch64: add some neon pix_abs functions

Message ID 76da7cb539cd456aa3c25c8470c9c4b5@EX13D07UWB004.ant.amazon.com
State Superseded
Series [FFmpeg-devel] lavc/aarch64: add some neon pix_abs functions

Checks

Context Check Description
andriy/make_aarch64_jetson success Make finished
andriy/make_fate_aarch64_jetson fail Make fate failed

Commit Message

Swinney, Jonathan March 4, 2022, 10:52 p.m. UTC
Add NEON-optimized implementations of:

 - ff_pix_abs16_neon
 - ff_pix_abs16_xy2_neon

In direct micro benchmarks of these functions versus their C
implementations, they performed as follows on AWS Graviton 2 (a sketch of
such a benchmark harness follows the diffstat below):

ff_pix_abs16_neon:
c:  benchmark ran 100000 iterations in 0.955383 seconds
ff: benchmark ran 100000 iterations in 0.097669 seconds

ff_pix_abs16_xy2_neon:
c:  benchmark ran 100000 iterations in 1.916759 seconds
ff: benchmark ran 100000 iterations in 0.414291 seconds

Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
---
 libavcodec/aarch64/Makefile              |   2 +
 libavcodec/aarch64/me_cmp_init_aarch64.c |  39 ++++
 libavcodec/aarch64/me_cmp_neon.S         | 230 +++++++++++++++++++++++
 libavcodec/me_cmp.c                      |   2 +
 libavcodec/me_cmp.h                      |   1 +
 5 files changed, 274 insertions(+)
 create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/me_cmp_neon.S
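
The commit message does not include the benchmark source. For reference, a
standalone harness along the following lines reproduces the shape of the
numbers above; the buffer size, stride, iteration count and timing method
are assumptions, not taken from the patch, and the scalar reference mirrors
pix_abs16_c() in libavcodec/me_cmp.c:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Scalar reference with the same semantics as pix_abs16_c() in
 * libavcodec/me_cmp.c: sum of absolute differences over a block
 * 16 pixels wide and h rows tall. */
static int pix_abs16_ref(const uint8_t *pix1, const uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int sum = 0;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 16; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}

int main(void)
{
    enum { STRIDE = 32, H = 16, ITERS = 100000 };
    static uint8_t pix1[STRIDE * H], pix2[STRIDE * H];
    volatile int sink;
    struct timespec t0, t1;

    for (size_t i = 0; i < sizeof(pix1); i++) {
        pix1[i] = rand() & 0xff;
        pix2[i] = rand() & 0xff;
    }

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < ITERS; i++)
        sink = pix_abs16_ref(pix1, pix2, STRIDE, H); /* or the NEON version */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    printf("benchmark ran %d iterations in %f seconds\n", ITERS,
           (double)(t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
    (void)sink;
    return 0;
}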

Comments

Martin Storsjö March 14, 2022, 10:39 p.m. UTC | #1
On Mon, 7 Mar 2022, Pop, Sebastian wrote:

> Here are a few suggestions:
>
>> +        add     d18, d17, d18               // add to the end result register
>> [...]
>> +        mov     w0, v18.S[0]                // copy result to general purpose register
>
> I think you can use 32-bit register s18 instead of d18.
> The mov with indexed vector is more expensive than fmov.

Oh, I hadn't considered that. In a tight loop, I can indeed measure a 
quite significant difference between those.

> add    s18, s18, s17
> fmov  w0, s18
>
>> +        subs    w4, w4, #1                  // decrement h and set flags for branch below
>> [...]
>> +        b.ne    2b                          // branch based on subs instruction above
>
> Please avoid the flags register to branch.
> Instead you could do:
>
> sub   w4, w4, #1
> cbnz w4, 2b

If there are other instructions between the sub and the b.ne, does this 
make any difference? (In most cases one can move the decrement into a 
suitable gap early in the loop anyway.) I.e. if the flags register was 
already set long ago, naively I'd expect that b.ne would be faster (or at 
least not slower) than cbnz.

Some benchmarking on Cortex A53, A72 and A73 seems to agree with my 
expectations too. (It'd be good if we'd have the patch at hand hooked up 
in checkasm, so that we could measure and compare exactly the function at 
hand.)

// Martin
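
For reference, hooking these functions up in checkasm could look roughly
like the sketch below. The test name, buffer sizes and the NULL
MpegEncContext argument (which the assembly documents as unused) are
assumptions, and a real test would also have to be registered in
tests/checkasm/checkasm.c and the checkasm Makefile:

#include "checkasm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/me_cmp.h"
#include "libavutil/mem_internal.h"

void checkasm_check_motion(void)
{
    /* 17 rows of slack so pix_abs16_xy2 can read one row and one byte
     * past the 16x16 block it is scoring. */
    LOCAL_ALIGNED_16(uint8_t, pix1, [64 * 17]);
    LOCAL_ALIGNED_16(uint8_t, pix2, [64 * 17]);
    AVCodecContext *avctx = avcodec_alloc_context3(NULL);
    MECmpContext c = { 0 };
    /* pix_abs[0][0] is pix_abs16, pix_abs[0][3] is pix_abs16_xy2 */
    static const int idx[] = { 0, 3 };
    static const char *const names[] = { "pix_abs16", "pix_abs16_xy2" };

    ff_me_cmp_init(&c, avctx);

    for (int i = 0; i < 64 * 17; i++) {
        pix1[i] = rnd() & 0xff;
        pix2[i] = rnd() & 0xff;
    }

    for (int i = 0; i < 2; i++) {
        if (check_func(c.pix_abs[0][idx[i]], "%s", names[i])) {
            declare_func(int, struct MpegEncContext *, uint8_t *, uint8_t *,
                         ptrdiff_t, int);
            /* The MpegEncContext argument is unused by these functions,
             * so NULL is passed. */
            if (call_ref(NULL, pix1, pix2, 64, 16) !=
                call_new(NULL, pix1, pix2, 64, 16))
                fail();
            bench_new(NULL, pix1, pix2, 64, 16);
        }
    }
    report("pix_abs");
    avcodec_free_context(&avctx);
}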

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..18869da1b4 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@  OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
 OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
@@ -46,6 +47,7 @@  NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
 NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
new file mode 100644
index 0000000000..fb827daaf5
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -0,0 +1,39 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                       ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                       ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->pix_abs[0][0] = ff_pix_abs16_neon;
+        c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+    }
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
new file mode 100644
index 0000000000..5422061ab1
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -0,0 +1,230 @@ 
+/*
+ * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+function ff_pix_abs16_neon, export=1
+        // x0   unused
+        // x1   uint8_t *pix1
+        // x2   uint8_t *pix2
+        // x3   ptrdiff_t stride
+        // w4   int h
+        // x5   uint8_t *pix3
+        cmp     w4, #4                      // if h < 4, jump to completion section
+        b.lt    2f
+        movi    v18.4S, #0                  // clear result accumulator
+1:
+        movi    v16.8H, #0                  // clear uabal accumulator
+        ld1     {v0.16B}, [x1], x3          // load pix1
+        ld1     {v4.16B}, [x2], x3          // load pix2
+        ld1     {v1.16B}, [x1], x3          // load pix1
+        ld1     {v5.16B}, [x2], x3          // load pix2
+        uabal   v16.8H, v0.8B, v4.8B        // absolute difference accumulate
+        uabal2  v16.8H, v0.16B, v4.16B
+        ld1     {v2.16B}, [x1], x3          // load pix1
+        ld1     {v6.16B}, [x2], x3          // load pix2
+        uabal   v16.8H, v1.8B, v5.8B        // absolute difference accumulate
+        uabal2  v16.8H, v1.16B, v5.16B
+        ld1     {v3.16B}, [x1], x3
+        ld1     {v7.16B}, [x2], x3
+        uabal   v16.8H, v2.8B, v6.8B
+        uabal2  v16.8H, v2.16B, v6.16B
+        sub     w4, w4, #4                  // h -= 4
+        uabal   v16.8H, v3.8B, v7.8B
+        uabal2  v16.8H, v3.16B, v7.16B
+        cmp     w4, #4                      // if h >= 4, loop
+        addv    h17, v16.8H                 // add up everything in v16 accumulator
+        add     d18, d17, d18               // add to the end result register
+
+        b.ge    1b
+        cbnz    w4, 2f                      // if iterations remain, jump to completion section
+
+        mov     w0, v18.S[0]                // copy result to general purpose register
+        ret
+
+2:
+        movi    v16.8H, #0                  // clear the uabal accumulator
+        ld1     {v0.16B}, [x1]              // load pix1
+        ld1     {v4.16B}, [x2]              // load pix2
+        add     x1, x1, x3                  // increment pointers
+        add     x2, x2, x3
+        uabal   v16.8H, v0.8B, v4.8B        // absolute difference accumulate
+        uabal2  v16.8H, v0.16B, v4.16B
+        addv    h17, v16.8H                 // add up v16
+        add     d18, d17, d18               // add to result
+        subs    w4, w4, #1                  // h -= 1
+        b.ne    2b
+
+        mov     w0, v18.S[0]                // copy result to general purpose register
+        ret
+endfunc
+
+function ff_pix_abs16_xy2_neon, export=1
+        // x0   unused
+        // x1   uint8_t *pix1
+        // x2   uint8_t *pix2
+        // x3   ptrdiff_t stride
+        // w4   int h
+        // x5   uint8_t *pix3
+        add     x5, x2, x3                  // create a pointer for pix3
+        movi    v0.2D, #0                   // initialize the result register
+
+        // I also tested these instructions to get pix2+1 from pix2, but it wasn't faster
+        // than just doing another full (unaligned) load.
+        // ldr     b21, [x5, #16]
+        // ushr    v4.2D, v2.2D, #8
+        // mov     v4.16B[15], v21.16B[0]
+        // mov     v4.16B[7], v2.16B[8]
+
+        // Load initial pix2 values for either the unrolled version or the completion version.
+        ldr     q4, [x2, #1]                // load pix2+1
+        ldr     q2, [x2]                    // load pix2
+        cmp     w4, #4                      // if h < 4 jump to the completion version
+        b.lt    2f
+1:
+        // This is an unrolled implementation. It completes 4 iterations of the C loop per branch.
+        // In each iteration, pix2[i+1] == pix3[i], so we need only three loads per iteration,
+        // plus two at the beginning to start.
+        ldr     q5, [x5, #1]                // load pix3+1
+        ld1     {v3.16B}, [x5], x3          // load pix3
+        ld1     {v1.16B}, [x1], x3          // load pix1
+
+        ldr     q16, [x5, #1]               // load pix3+1
+        ld1     {v7.16B}, [x5], x3          // load pix3
+        ld1     {v6.16B}, [x1], x3          // load pix1
+
+        ldr     q19, [x5, #1]               // load pix3+1
+        ld1     {v18.16B}, [x5], x3         // load pix3
+        ld1     {v17.16B}, [x1], x3         // load pix1
+
+        ldr     q22, [x5, #1]               // load pix3+1
+        ld1     {v21.16B}, [x5], x3         // load pix3
+        ld1     {v20.16B}, [x1], x3         // load pix1
+
+        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
+        uaddl   v23.8H, v2.8B, v4.8B        // pix2 + pix2+1 0..7
+        uaddl2  v24.8H, v2.16B, v4.16B      // pix2 + pix2+1 8..15
+        uaddl   v30.8H, v3.8B, v5.8B        // pix3 + pix3+1 0..7
+        uaddl2  v31.8H, v3.16B, v5.16B      // pix3 + pix3+1 8..15
+        add     v23.8H, v23.8H, v30.8H      // add up 0..7
+        add     v24.8H, v24.8H, v31.8H      // add up 8..15
+        urshr   v23.8H, v23.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v24.8H, v24.8H, #2          // shift right 2 8..15
+
+        uaddl   v26.8H, v3.8B, v5.8B        // pix2 + pix2+1 0..7
+        uaddl2  v27.8H, v3.16B, v5.16B      // pix2 + pix2+1 8..15
+        uaddl   v2.8H, v7.8B, v16.8B        // pix3 + pix3+1 0..7
+        uaddl2  v4.8H, v7.16B, v16.16B      // pix3 + pix3+1 8..15
+        add     v26.8H, v26.8H, v2.8H       // add up 0..7
+        add     v27.8H, v27.8H, v4.8H       // add up 8..15
+        urshr   v26.8H, v26.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v27.8H, v27.8H, #2          // shift right 2 8..15
+
+        uaddl   v28.8H, v7.8B, v16.8B       // pix2 + pix2+1 0..7
+        uaddl2  v29.8H, v7.16B, v16.16B     // pix2 + pix2+1 8..15
+        uaddl   v3.8H, v18.8B, v19.8B       // pix3 + pix3+1 0..7
+        uaddl2  v5.8H, v18.16B, v19.16B     // pix3 + pix3+1 8..15
+        add     v28.8H, v28.8H, v3.8H       // add up 0..7
+        add     v29.8H, v29.8H, v5.8H       // add up 8..15
+        urshr   v28.8H, v28.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v29.8H, v29.8H, #2          // shift right 2 8..15
+
+        uaddl   v30.8H, v18.8B, v19.8B      // pix2 + pix2+1 0..7
+        uaddl2  v31.8H, v18.16B, v19.16B    // pix2 + pix2+1 8..15
+        uaddl   v2.8H, v21.8B, v22.8B       // pix3 + pix3+1 0..7
+        uaddl2  v4.8H, v21.16B, v22.16B     // pix3 + pix3+1 8..15
+        add     v30.8H, v30.8H, v2.8H       // add up 0..7
+        add     v31.8H, v31.8H, v4.8H       // add up 8..15
+        urshr   v30.8H, v30.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v31.8H, v31.8H, #2          // shift right 2 8..15
+
+        // Averages are now stored in these registers:
+        // v23, v24
+        // v26, v27
+        // v28, v29
+        // v30, v31
+        // pix1 values in these registers:
+        // v1, v6, v17, v20
+        // available
+        // v2, v3, v4, v5, v7, v16, v18, v19, v25
+
+        uxtl2   v2.8H, v1.16B               // 8->16 bits pix1 8..15
+        uxtl    v1.8H, v1.8B                // 8->16 bits pix1 0..7
+        uxtl2   v7.8H, v6.16B               // 8->16 bits pix1 8..15
+        uxtl    v6.8H, v6.8B                // 8->16 bits pix1 0..7
+        uxtl2   v18.8H, v17.16B             // 8->16 bits pix1 8..15
+        uxtl    v17.8H, v17.8B              // 8->16 bits pix1 0..7
+        uxtl2   v25.8H, v20.16B             // 8->16 bits pix1 8..15
+        uxtl    v20.8H, v20.8B              // 8->16 bits pix1 0..7
+
+        uabd    v5.8H, v1.8H, v23.8H        // absolute difference 0..7
+        uaba    v5.8H, v2.8H, v24.8H        // absolute difference accumulate 8..15
+        uaba    v5.8H, v6.8H, v26.8H        // absolute difference accumulate 0..7
+        uaba    v5.8H, v7.8H, v27.8H        // absolute difference accumulate 8..15
+        uaba    v5.8H, v17.8H, v28.8H       // absolute difference accumulate 0..7
+        uaba    v5.8H, v18.8H, v29.8H       // absolute difference accumulate 8..15
+        uaba    v5.8H, v20.8H, v30.8H       // absolute difference accumulate 0..7
+        uaba    v5.8H, v25.8H, v31.8H       // absolute difference accumulate 8..15
+
+        uaddlv  s5, v5.8H                   // add up accumulated values
+        add     d0, d0, d5                  // add to final result
+
+        mov     v2.16B, v21.16B             // pix3 -> pix2
+        mov     v4.16B, v22.16B             // pix3+1 -> pix2+1
+
+        sub     w4, w4, #4                  // h -= 4
+        cmp     w4, #4                      // loop if h >= 4
+        b.ge    1b
+        cbnz    w4, 2f                      // if iterations remain jump to completion section
+
+        mov     w0, v0.S[0]                 // copy result to general purpose register
+        ret
+2:
+        // q2 and q4 are set either at the end of this loop or from the unrolled version
+        // above, which branches here to complete iterations when h % 4 != 0.
+        ldr     q5, [x5, #1]                // load pix3+1
+        ld1     {v3.16B}, [x5], x3          // load pix3
+        ld1     {v1.16B}, [x1], x3          // load pix1
+        subs    w4, w4, #1                  // decrement h and set flags for branch below
+
+        uaddl   v16.8H, v2.8B, v4.8B        // pix2 + pix2+1 0..7
+        uaddl2  v17.8H, v2.16B, v4.16B      // pix2 + pix2+1 8..15
+        uaddl   v18.8H, v3.8B, v5.8B        // pix3 + pix3+1 0..7
+        uaddl2  v19.8H, v3.16B, v5.16B      // pix3 + pix3+1 8..15
+        add     v16.8H, v16.8H, v18.8H      // add up 0..7
+        add     v17.8H, v17.8H, v19.8H      // add up 8..15
+        // divide by 4 to compute the average of values summed above
+        urshr   v16.8H, v16.8H, #2          // shift right by 2 0..7 (rounding shift right)
+        urshr   v17.8H, v17.8H, #2          // shift right by 2 8..15
+
+        uxtl2   v8.8H, v1.16B               // 8->16 bits pix1 8..15
+        uxtl    v1.8H, v1.8B                // 8->16 bits pix1 0..7
+
+        uabd    v6.8H, v1.8H, v16.8H        // absolute difference 0..7
+        uaba    v6.8H, v8.8H, v17.8H        // absolute difference accumulate 8..15
+        addv    h6, v6.8H                   // add up accumulator in v6
+        add     d0, d0, d6
+
+        mov     v2.16B, v3.16B              // pix3 -> pix2
+        mov     v4.16B, v5.16B              // pix3+1 -> pix2+1
+
+        b.ne    2b                          // branch based on subs instruction above
+        mov     w0, v0.S[0]                 // copy result to general purpose register
+        ret
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index b2f87d2e1b..60053a1b92 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1064,6 +1064,8 @@  av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
         ff_me_cmp_init_alpha(c, avctx);
     if (ARCH_ARM)
         ff_me_cmp_init_arm(c, avctx);
+    if (ARCH_AARCH64)
+        ff_me_cmp_init_aarch64(c, avctx);
     if (ARCH_PPC)
         ff_me_cmp_init_ppc(c, avctx);
     if (ARCH_X86)
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index e9b5161c9a..4dd059223d 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -82,6 +82,7 @@  typedef struct MECmpContext {
 void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);