diff mbox series

[FFmpeg-devel,4/4] swscale/aarch64: add neon {lum, chr}ConvertRange

Message ID 20240607140543.130761-4-ramiro.polla@gmail.com
State New
Headers show
Series [FFmpeg-devel,1/4] tests/checkasm: cosmetics, one object per line in Makefile | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Ramiro Polla June 7, 2024, 2:05 p.m. UTC
chrRangeFromJpeg_8_c: 28.5
chrRangeFromJpeg_8_neon: 21.2
chrRangeFromJpeg_24_c: 81.2
chrRangeFromJpeg_24_neon: 34.7
chrRangeFromJpeg_128_c: 425.2
chrRangeFromJpeg_128_neon: 162.0
chrRangeFromJpeg_144_c: 480.2
chrRangeFromJpeg_144_neon: 180.2
chrRangeFromJpeg_256_c: 838.2
chrRangeFromJpeg_256_neon: 318.0
chrRangeFromJpeg_512_c: 1698.2
chrRangeFromJpeg_512_neon: 630.0
chrRangeToJpeg_8_c: 56.0
chrRangeToJpeg_8_neon: 23.5
chrRangeToJpeg_24_c: 147.7
chrRangeToJpeg_24_neon: 38.2
chrRangeToJpeg_128_c: 760.2
chrRangeToJpeg_128_neon: 182.5
chrRangeToJpeg_144_c: 857.7
chrRangeToJpeg_144_neon: 204.5
chrRangeToJpeg_256_c: 1504.2
chrRangeToJpeg_256_neon: 358.5
chrRangeToJpeg_512_c: 3025.7
chrRangeToJpeg_512_neon: 710.5
lumRangeFromJpeg_8_c: 24.0
lumRangeFromJpeg_8_neon: 18.2
lumRangeFromJpeg_24_c: 64.0
lumRangeFromJpeg_24_neon: 22.2
lumRangeFromJpeg_128_c: 289.2
lumRangeFromJpeg_128_neon: 79.2
lumRangeFromJpeg_144_c: 334.7
lumRangeFromJpeg_144_neon: 87.7
lumRangeFromJpeg_256_c: 579.5
lumRangeFromJpeg_256_neon: 152.0
lumRangeFromJpeg_512_c: 1208.0
lumRangeFromJpeg_512_neon: 299.0
lumRangeToJpeg_8_c: 30.0
lumRangeToJpeg_8_neon: 19.0
lumRangeToJpeg_24_c: 82.2
lumRangeToJpeg_24_neon: 24.0
lumRangeToJpeg_128_c: 440.7
lumRangeToJpeg_128_neon: 90.5
lumRangeToJpeg_144_c: 502.0
lumRangeToJpeg_144_neon: 102.2
lumRangeToJpeg_256_c: 893.7
lumRangeToJpeg_256_neon: 178.0
lumRangeToJpeg_512_c: 1793.7
lumRangeToJpeg_512_neon: 355.0
---
 libswscale/aarch64/Makefile             |   1 +
 libswscale/aarch64/range_convert_neon.S | 103 ++++++++++++++++++++++++
 libswscale/aarch64/swscale.c            |  21 +++++
 libswscale/swscale_internal.h           |   1 +
 libswscale/utils.c                      |   4 +-
 5 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 libswscale/aarch64/range_convert_neon.S

Comments

Martin Storsjö June 10, 2024, 11:56 a.m. UTC | #1
On Fri, 7 Jun 2024, Ramiro Polla wrote:

> chrRangeFromJpeg_8_c: 28.5
> chrRangeFromJpeg_8_neon: 21.2
> chrRangeFromJpeg_24_c: 81.2
> chrRangeFromJpeg_24_neon: 34.7
> chrRangeFromJpeg_128_c: 425.2
> chrRangeFromJpeg_128_neon: 162.0
> chrRangeFromJpeg_144_c: 480.2
> chrRangeFromJpeg_144_neon: 180.2
> chrRangeFromJpeg_256_c: 838.2
> chrRangeFromJpeg_256_neon: 318.0
> chrRangeFromJpeg_512_c: 1698.2
> chrRangeFromJpeg_512_neon: 630.0
> chrRangeToJpeg_8_c: 56.0
> chrRangeToJpeg_8_neon: 23.5
> chrRangeToJpeg_24_c: 147.7
> chrRangeToJpeg_24_neon: 38.2
> chrRangeToJpeg_128_c: 760.2
> chrRangeToJpeg_128_neon: 182.5
> chrRangeToJpeg_144_c: 857.7
> chrRangeToJpeg_144_neon: 204.5
> chrRangeToJpeg_256_c: 1504.2
> chrRangeToJpeg_256_neon: 358.5
> chrRangeToJpeg_512_c: 3025.7
> chrRangeToJpeg_512_neon: 710.5
> lumRangeFromJpeg_8_c: 24.0
> lumRangeFromJpeg_8_neon: 18.2
> lumRangeFromJpeg_24_c: 64.0
> lumRangeFromJpeg_24_neon: 22.2
> lumRangeFromJpeg_128_c: 289.2
> lumRangeFromJpeg_128_neon: 79.2
> lumRangeFromJpeg_144_c: 334.7
> lumRangeFromJpeg_144_neon: 87.7
> lumRangeFromJpeg_256_c: 579.5
> lumRangeFromJpeg_256_neon: 152.0
> lumRangeFromJpeg_512_c: 1208.0
> lumRangeFromJpeg_512_neon: 299.0
> lumRangeToJpeg_8_c: 30.0
> lumRangeToJpeg_8_neon: 19.0
> lumRangeToJpeg_24_c: 82.2
> lumRangeToJpeg_24_neon: 24.0
> lumRangeToJpeg_128_c: 440.7
> lumRangeToJpeg_128_neon: 90.5
> lumRangeToJpeg_144_c: 502.0
> lumRangeToJpeg_144_neon: 102.2
> lumRangeToJpeg_256_c: 893.7
> lumRangeToJpeg_256_neon: 178.0
> lumRangeToJpeg_512_c: 1793.7
> lumRangeToJpeg_512_neon: 355.0
> ---
> libswscale/aarch64/Makefile             |   1 +
> libswscale/aarch64/range_convert_neon.S | 103 ++++++++++++++++++++++++
> libswscale/aarch64/swscale.c            |  21 +++++
> libswscale/swscale_internal.h           |   1 +
> libswscale/utils.c                      |   4 +-
> 5 files changed, 129 insertions(+), 1 deletion(-)
> create mode 100644 libswscale/aarch64/range_convert_neon.S
>
> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
> index da1d909561..6923827f82 100644
> --- a/libswscale/aarch64/Makefile
> +++ b/libswscale/aarch64/Makefile
> @@ -4,5 +4,6 @@ OBJS        += aarch64/rgb2rgb.o                \
>
> NEON-OBJS   += aarch64/hscale.o                 \
>                aarch64/output.o                 \
> +               aarch64/range_convert_neon.o     \
>                aarch64/rgb2rgb_neon.o           \
>                aarch64/yuv2rgb_neon.o           \
> diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
> new file mode 100644
> index 0000000000..5e104971f0
> --- /dev/null
> +++ b/libswscale/aarch64/range_convert_neon.S
> @@ -0,0 +1,103 @@
> +/*
> + * Copyright (c) 2024 Ramiro Polla
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +.macro lumConvertRange name max mult offset shift

We usually use commas between the macro arguments here. Apparently it 
doesn't make any difference for any of the tools we support, but it would 
be nice for consistency. (When invoking macros, commas between arguments 
are optional for most platforms, but not when targeting Apple platforms, 
so being strict with consistent use of commas is generally good.)

> +const offset_\name, align=4
> +        .word \offset, \offset, \offset, \offset
> +endconst
> +function ff_\name, export=1
> +.if \max != 0
> +        mov             w3, #\max
> +        dup             v24.8h, w3
> +.endif
> +        mov             w3, #\mult
> +        dup             v25.4s, w3
> +        movrel          x3, offset_\name
> +        ld1             {v26.4s}, [x3]

FWIW, I did see that you were recommended this form, over ld1r, based on 
some microarchitectural performance numbers. However in our preexisting 
assembly, manually pre-splatting vectors like this is unusual I would say. 
I don't have a strong opinion on the matter though.

Anyway, the assembly looks reasonable to me.

// Martin
Ramiro Polla June 11, 2024, 12:33 p.m. UTC | #2
On Mon, Jun 10, 2024 at 1:56 PM Martin Storsjö <martin@martin.st> wrote:
> On Fri, 7 Jun 2024, Ramiro Polla wrote:
>
> > chrRangeFromJpeg_8_c: 28.5
> > chrRangeFromJpeg_8_neon: 21.2
> > chrRangeFromJpeg_24_c: 81.2
> > chrRangeFromJpeg_24_neon: 34.7
> > chrRangeFromJpeg_128_c: 425.2
> > chrRangeFromJpeg_128_neon: 162.0
> > chrRangeFromJpeg_144_c: 480.2
> > chrRangeFromJpeg_144_neon: 180.2
> > chrRangeFromJpeg_256_c: 838.2
> > chrRangeFromJpeg_256_neon: 318.0
> > chrRangeFromJpeg_512_c: 1698.2
> > chrRangeFromJpeg_512_neon: 630.0
> > chrRangeToJpeg_8_c: 56.0
> > chrRangeToJpeg_8_neon: 23.5
> > chrRangeToJpeg_24_c: 147.7
> > chrRangeToJpeg_24_neon: 38.2
> > chrRangeToJpeg_128_c: 760.2
> > chrRangeToJpeg_128_neon: 182.5
> > chrRangeToJpeg_144_c: 857.7
> > chrRangeToJpeg_144_neon: 204.5
> > chrRangeToJpeg_256_c: 1504.2
> > chrRangeToJpeg_256_neon: 358.5
> > chrRangeToJpeg_512_c: 3025.7
> > chrRangeToJpeg_512_neon: 710.5
> > lumRangeFromJpeg_8_c: 24.0
> > lumRangeFromJpeg_8_neon: 18.2
> > lumRangeFromJpeg_24_c: 64.0
> > lumRangeFromJpeg_24_neon: 22.2
> > lumRangeFromJpeg_128_c: 289.2
> > lumRangeFromJpeg_128_neon: 79.2
> > lumRangeFromJpeg_144_c: 334.7
> > lumRangeFromJpeg_144_neon: 87.7
> > lumRangeFromJpeg_256_c: 579.5
> > lumRangeFromJpeg_256_neon: 152.0
> > lumRangeFromJpeg_512_c: 1208.0
> > lumRangeFromJpeg_512_neon: 299.0
> > lumRangeToJpeg_8_c: 30.0
> > lumRangeToJpeg_8_neon: 19.0
> > lumRangeToJpeg_24_c: 82.2
> > lumRangeToJpeg_24_neon: 24.0
> > lumRangeToJpeg_128_c: 440.7
> > lumRangeToJpeg_128_neon: 90.5
> > lumRangeToJpeg_144_c: 502.0
> > lumRangeToJpeg_144_neon: 102.2
> > lumRangeToJpeg_256_c: 893.7
> > lumRangeToJpeg_256_neon: 178.0
> > lumRangeToJpeg_512_c: 1793.7
> > lumRangeToJpeg_512_neon: 355.0
> > ---
> > libswscale/aarch64/Makefile             |   1 +
> > libswscale/aarch64/range_convert_neon.S | 103 ++++++++++++++++++++++++
> > libswscale/aarch64/swscale.c            |  21 +++++
> > libswscale/swscale_internal.h           |   1 +
> > libswscale/utils.c                      |   4 +-
> > 5 files changed, 129 insertions(+), 1 deletion(-)
> > create mode 100644 libswscale/aarch64/range_convert_neon.S
> >
> > diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
> > index da1d909561..6923827f82 100644
> > --- a/libswscale/aarch64/Makefile
> > +++ b/libswscale/aarch64/Makefile
> > @@ -4,5 +4,6 @@ OBJS        += aarch64/rgb2rgb.o                \
> >
> > NEON-OBJS   += aarch64/hscale.o                 \
> >                aarch64/output.o                 \
> > +               aarch64/range_convert_neon.o     \
> >                aarch64/rgb2rgb_neon.o           \
> >                aarch64/yuv2rgb_neon.o           \
> > diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
> > new file mode 100644
> > index 0000000000..5e104971f0
> > --- /dev/null
> > +++ b/libswscale/aarch64/range_convert_neon.S
> > @@ -0,0 +1,103 @@
> > +/*
> > + * Copyright (c) 2024 Ramiro Polla
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/aarch64/asm.S"
> > +
> > +.macro lumConvertRange name max mult offset shift
>
> We usually use commas between the macro arguments here. Apparently it
> doesn't make any difference for any of the tools we support, but it would
> be nice for consistency. (When invoking macros, commas between arguments
> are optional for most platforms, but not when targeting Apple platforms,
> so being strict with consistent use of commas is generally good.)

Fixed in the new patchset.

> > +const offset_\name, align=4
> > +        .word \offset, \offset, \offset, \offset
> > +endconst
> > +function ff_\name, export=1
> > +.if \max != 0
> > +        mov             w3, #\max
> > +        dup             v24.8h, w3
> > +.endif
> > +        mov             w3, #\mult
> > +        dup             v25.4s, w3
> > +        movrel          x3, offset_\name
> > +        ld1             {v26.4s}, [x3]
>
> FWIW, I did see that you were recommended this form, over ld1r, based on
> some microarchitectural performance numbers. However in our preexisting
> assembly, manually pre-splatting vectors like this is unusual I would say.
> I don't have a strong opinion on the matter though.
>
> Anyway, the assembly looks reasonable to me.

I changed it to movz/movk/dup in the new patchset (tested on a Raspberry
Pi 5, but not on macOS).

Thanks,
Ramiro
diff mbox series

Patch

diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index da1d909561..6923827f82 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -4,5 +4,6 @@  OBJS        += aarch64/rgb2rgb.o                \
 
 NEON-OBJS   += aarch64/hscale.o                 \
                aarch64/output.o                 \
+               aarch64/range_convert_neon.o     \
                aarch64/rgb2rgb_neon.o           \
                aarch64/yuv2rgb_neon.o           \
diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
new file mode 100644
index 0000000000..5e104971f0
--- /dev/null
+++ b/libswscale/aarch64/range_convert_neon.S
@@ -0,0 +1,103 @@ 
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro lumConvertRange name, max, mult, offset, shift
+const offset_\name, align=4
+        .word \offset, \offset, \offset, \offset        // additive term, one copy per 32-bit lane
+endconst
+function ff_\name, export=1
+.if \max != 0
+        mov             w3, #\max                       // max == 0 means: no clamping at all
+        dup             v24.8h, w3                      // v24 = max in all 8 halfword lanes
+.endif
+        mov             w3, #\mult
+        dup             v25.4s, w3                      // v25 = mult in all 4 word lanes
+        movrel          x3, offset_\name
+        ld1             {v26.4s}, [x3]                  // v26 = offset in all 4 word lanes
+1:      // per 8 samples, in place: dst[i] = (min(dst[i], max) * mult + offset) >> shift
+        ld1             {v0.8h}, [x0]                   // x0 = dst (first C argument)
+.if \max != 0
+        smin            v0.8h, v0.8h, v24.8h            // signed clamp to max
+.endif
+        mov             v16.16b, v26.16b                // mla accumulates, so seed with offset
+        mov             v18.16b, v26.16b
+        sxtl            v20.4s, v0.4h                   // sign-extend low 4 samples to 32 bits
+        sxtl2           v22.4s, v0.8h                   // sign-extend high 4 samples to 32 bits
+        mla             v16.4s, v20.4s, v25.4s          // offset + sample * mult
+        mla             v18.4s, v22.4s, v25.4s
+        shrn            v0.4h, v16.4s, #\shift          // >> shift, keep low 16 bits
+        shrn2           v0.8h, v18.4s, #\shift
+        subs            w1, w1, #8                      // w1 = width (second C argument)
+        st1             {v0.8h}, [x0], #16              // write back, advance dst by 8 samples
+        b.gt            1b                              // whole groups of 8: width is rounded up
+        ret
+endfunc
+.endm
+
+.macro chrConvertRange name, max, mult, offset, shift
+const offset_\name, align=4
+        .word \offset, \offset, \offset, \offset        // additive term, one copy per 32-bit lane
+endconst
+function ff_\name, export=1
+.if \max != 0
+        mov             w3, #\max                       // max == 0 means: no clamping at all
+        dup             v24.8h, w3                      // v24 = max in all 8 halfword lanes
+.endif
+        mov             w3, #\mult
+        dup             v25.4s, w3                      // v25 = mult in all 4 word lanes
+        movrel          x3, offset_\name
+        ld1             {v26.4s}, [x3]                  // v26 = offset in all 4 word lanes
+1:      // per 8 samples of each plane, in place: (min(s, max) * mult + offset) >> shift
+        ld1             {v0.8h}, [x0]                   // x0 = dstU (first C argument)
+        ld1             {v1.8h}, [x1]                   // x1 = dstV (second C argument)
+.if \max != 0
+        smin            v0.8h, v0.8h, v24.8h            // signed clamp to max
+        smin            v1.8h, v1.8h, v24.8h
+.endif
+        mov             v16.16b, v26.16b                // mla accumulates, so seed with offset
+        mov             v17.16b, v26.16b
+        mov             v18.16b, v26.16b
+        mov             v19.16b, v26.16b
+        sxtl            v20.4s, v0.4h                   // U, low 4 samples -> 32 bits
+        sxtl            v21.4s, v1.4h                   // V, low 4 samples -> 32 bits
+        sxtl2           v22.4s, v0.8h                   // U, high 4 samples -> 32 bits
+        sxtl2           v23.4s, v1.8h                   // V, high 4 samples -> 32 bits
+        mla             v16.4s, v20.4s, v25.4s          // offset + sample * mult
+        mla             v17.4s, v21.4s, v25.4s
+        mla             v18.4s, v22.4s, v25.4s
+        mla             v19.4s, v23.4s, v25.4s
+        shrn            v0.4h, v16.4s, #\shift          // >> shift, keep low 16 bits
+        shrn            v1.4h, v17.4s, #\shift
+        shrn2           v0.8h, v18.4s, #\shift
+        shrn2           v1.8h, v19.4s, #\shift
+        subs            w2, w2, #8                      // w2 = width (third C argument)
+        st1             {v0.8h}, [x0], #16              // write back U, advance by 8 samples
+        st1             {v1.8h}, [x1], #16              // write back V, advance by 8 samples
+        b.gt            1b                              // whole groups of 8: width is rounded up
+        ret
+endfunc
+.endm
+
+lumConvertRange lumRangeToJpeg_neon,   30189, 19077, -39057361, 14
+chrConvertRange chrRangeToJpeg_neon,   30775,  4663,  -9289992, 12
+lumConvertRange lumRangeFromJpeg_neon,     0, 14071,  33561947, 14
+chrConvertRange chrRangeFromJpeg_neon,     0,  1799,   4081085, 11
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index bbd9719a44..7344f75b2e 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -201,6 +201,26 @@  void ff_yuv2plane1_8_neon(
     default: break;                                                     \
     }
 
+void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
+void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
+void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
+void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
+
+/* Also called from sws_setColorspaceDetails(), so NEON is checked here too. */
+av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
+{
+    if (have_neon(av_get_cpu_flags()) && c->dstBpc <= 14 &&
+        c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+        if (c->srcRange) { /* srcRange set: install the FromJpeg pair */
+            c->lumConvertRange = ff_lumRangeFromJpeg_neon;
+            c->chrConvertRange = ff_chrRangeFromJpeg_neon;
+        } else {           /* otherwise: install the ToJpeg pair */
+            c->lumConvertRange = ff_lumRangeToJpeg_neon;
+            c->chrConvertRange = ff_chrRangeToJpeg_neon;
+        }
+    }
+}
+
 av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -212,5 +232,6 @@  av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
         }
+        ff_sws_init_range_convert_aarch64(c);
     }
 }
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 92f6105443..1059f8a6de 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -697,6 +697,7 @@  void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
 void ff_updateMMXDitherTables(SwsContext *c, int dstY);
 
 av_cold void ff_sws_init_range_convert(SwsContext *c);
+av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c);
 av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
 av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
 
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 8dfa57b5ff..12dba712c1 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1080,7 +1080,9 @@  int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
 
     if (need_reinit) {
         ff_sws_init_range_convert(c);
-#if ARCH_LOONGARCH64
+#if ARCH_AARCH64
+        ff_sws_init_range_convert_aarch64(c);
+#elif ARCH_LOONGARCH64
         ff_sws_init_range_convert_loongarch(c);
 #elif ARCH_X86
         ff_sws_init_range_convert_x86(c);