diff mbox series

[FFmpeg-devel,3/4] lavc/aarch64: add HEVC sao_band NEON

Message ID 20210107121020.86179-4-josh@itanimul.li
State New
Headers show
Series AArch64 NEON for HEVC
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Josh Dekker Jan. 7, 2021, 12:10 p.m. UTC
Only works for 8x8.

Signed-off-by: Josh Dekker <josh@itanimul.li>
---
 libavcodec/aarch64/Makefile           |  3 +-
 libavcodec/aarch64/hevcdsp_init.c     |  7 +++
 libavcodec/aarch64/hevcdsp_sao_neon.S | 87 +++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_sao_neon.S

Comments

Martin Storsjö Jan. 16, 2021, 11:14 p.m. UTC | #1
On Thu, 7 Jan 2021, Josh Dekker wrote:

> Only works for 8x8.
>
> Signed-off-by: Josh Dekker <josh@itanimul.li>
> ---
> libavcodec/aarch64/Makefile           |  3 +-
> libavcodec/aarch64/hevcdsp_init.c     |  7 +++
> libavcodec/aarch64/hevcdsp_sao_neon.S | 87 +++++++++++++++++++++++++++
> 3 files changed, 96 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/hevcdsp_sao_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 42d80bf74c..1f54fc31f4 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -55,7 +55,8 @@ NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
> NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
> NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
> NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_add_res_neon.o      \
> -                                           aarch64/hevcdsp_idct_neon.o
> +                                           aarch64/hevcdsp_idct_neon.o         \
> +                                           aarch64/hevcdsp_sao_neon.o
> NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opusdsp_neon.o
> NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
> NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
> diff --git a/libavcodec/aarch64/hevcdsp_init.c b/libavcodec/aarch64/hevcdsp_init.c
> index 2cd7ef3a6c..8f0a923ab1 100644
> --- a/libavcodec/aarch64/hevcdsp_init.c
> +++ b/libavcodec/aarch64/hevcdsp_init.c
> @@ -23,6 +23,11 @@
> #include "libavcodec/hevcdsp.h"
> #include "libavcodec/avcodec.h"
> 
> +void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
> +                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
> +                                  int16_t *sao_offset_val, int sao_left_class,
> +                                  int width, int height);
> +
> void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
> void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
> void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
> @@ -53,6 +58,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> {
>     int cpu_flags = av_get_cpu_flags();
>     if (have_neon(cpu_flags) && bit_depth == 8) {
> +        c->sao_band_filter[0]          = ff_hevc_sao_band_filter_8x8_8_neon;
> +
>         c->add_residual[0]             = ff_hevc_add_residual_4x4_8_neon;
>         c->add_residual[1]             = ff_hevc_add_residual_8x8_8_neon;
>         c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
> diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
> new file mode 100644
> index 0000000000..25b6c25117
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
> @@ -0,0 +1,87 @@
> +/* -*-arm64-*-
> + * vim: syntax=arm64asm
> + *
> + * AArch64 NEON optimised SAO functions for HEVC decoding
> + *
> + * Copyright (c) 2020 Josh Dekker <josh@itanimul.li>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +// void sao_band_filter(uint8_t *_dst, uint8_t *_src,
> +//                      ptrdiff_t stride_dst, ptrdiff_t stride_src,
> +//                      int16_t *sao_offset_val, int sao_left_class,
> +//                      int width, int height)
> +function ff_hevc_sao_band_filter_8x8_8_neon, export=1
> +    sub sp, sp, #64
> +    stp xzr, xzr, [sp]
> +    stp xzr, xzr, [sp, #16]
> +    stp xzr, xzr, [sp, #32]
> +    stp xzr, xzr, [sp, #48]
> +    mov w8, #4
> +.setup:

Please use a numeric anonymous label for this one; gas-preprocessor 
doesn't handle these kinds of labels. This actually creates a real symbol 
(though, a non-external one) here; to make it just a local label, you'd 
prefix it with .L on ELF and with L on MachO. In dav1d there's a macro in 
asm.S for doing this, but for here, I'd just suggest you'd go with 1: 
instead, for simplicity.

> +    ldrsh x9, [x4, x8, lsl #1] // x9 = sao_offset_val[k+1]
> +    subs w8, w8, #1
> +    add w10, w8, w5 // x10 = k + sao_left_class
> +    and w10, w10, #0x1F
> +    strh w9, [sp, x10, lsl #1]
> +    bne .setup
> +    ld1 {v16.16B-v19.16B}, [sp], #64

MSVC's armasm64 can't handle these register lists with a dash; 
gas-preprocessor does try to expand them, but it turned out it had a minor 
bug. I'll send patches for fixing that though.

> +    movi v20.8H, #1
> +0:  // beginning of line
> +    mov w8, w6
> +8:
> +    // Simple layout for accessing 16bit values
> +    // with 8bit LUT.
> +    //
> +    //   00  01  02  03  04  05  06  07
> +    // +----------------------------------->
> +    // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
> +    // +----------------------------------->
> +    //    i-0     i-1     i-2     i-3
> +    // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
> +    ld1 {v2.8B}, [x1]
> +    // load src[x]
> +    ushll v0.8H, v2.8B, #0

uxtl

> +    // >> shift
> +    ushr v2.8H, v0.8H, #3 // BIT_DEPTH - 3
> +    // x2 (access lower short)
> +    shl v1.8H, v2.8H, #1 // low (x2, accessing short)
> +    // +1 access upper short
> +    add v3.8H, v1.8H, v20.8H
> +    // shift insert index to upper byte
> +    sli v1.8H, v3.8H, #8
> +    // table
> +    tbx v2.16B, {v16.16B-v19.16B}, v1.16B
> +    // src[x] + table
> +    add v1.8H, v0.8H, v2.8H
> +    // clip + narrow
> +    sqxtun v4.8B, v1.8H
> +    // store
> +    st1 {v4.8B}, [x0]

This all looks very serial to me, most instructions depend on the output 
from the previous one, so most of them will stall for a couple 
instructions. tbx in particular has a rather high latency. If possible, 
I'd suggest unrolling the loop so you operate on two registers of data at 
the same time.

Didn't try following the code algorithm though, but it overall looks 
pretty sensible.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 42d80bf74c..1f54fc31f4 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -55,7 +55,8 @@  NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
 NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
 NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_add_res_neon.o      \
-                                           aarch64/hevcdsp_idct_neon.o
+                                           aarch64/hevcdsp_idct_neon.o         \
+                                           aarch64/hevcdsp_sao_neon.o
 NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opusdsp_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
 NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
diff --git a/libavcodec/aarch64/hevcdsp_init.c b/libavcodec/aarch64/hevcdsp_init.c
index 2cd7ef3a6c..8f0a923ab1 100644
--- a/libavcodec/aarch64/hevcdsp_init.c
+++ b/libavcodec/aarch64/hevcdsp_init.c
@@ -23,6 +23,11 @@ 
 #include "libavcodec/hevcdsp.h"
 #include "libavcodec/avcodec.h"
 
+void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                  int16_t *sao_offset_val, int sao_left_class,
+                                  int width, int height);
+
 void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
@@ -53,6 +58,8 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
     if (have_neon(cpu_flags) && bit_depth == 8) {
+        c->sao_band_filter[0]          = ff_hevc_sao_band_filter_8x8_8_neon;
+
         c->add_residual[0]             = ff_hevc_add_residual_4x4_8_neon;
         c->add_residual[1]             = ff_hevc_add_residual_8x8_8_neon;
         c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
new file mode 100644
index 0000000000..25b6c25117
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -0,0 +1,87 @@ 
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * AArch64 NEON optimised SAO functions for HEVC decoding
+ *
+ * Copyright (c) 2020 Josh Dekker <josh@itanimul.li>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// void sao_band_filter(uint8_t *_dst, uint8_t *_src,
+//                      ptrdiff_t stride_dst, ptrdiff_t stride_src,
+//                      int16_t *sao_offset_val, int sao_left_class,
+//                      int width, int height)
+function ff_hevc_sao_band_filter_8x8_8_neon, export=1
+    sub sp, sp, #64
+    stp xzr, xzr, [sp]
+    stp xzr, xzr, [sp, #16]
+    stp xzr, xzr, [sp, #32]
+    stp xzr, xzr, [sp, #48]
+    mov w8, #4
+.setup:
+    ldrsh x9, [x4, x8, lsl #1] // x9 = sao_offset_val[k+1]
+    subs w8, w8, #1
+    add w10, w8, w5 // x10 = k + sao_left_class
+    and w10, w10, #0x1F
+    strh w9, [sp, x10, lsl #1]
+    bne .setup
+    ld1 {v16.16B-v19.16B}, [sp], #64
+    movi v20.8H, #1
+0:  // beginning of line
+    mov w8, w6
+8:
+    // Simple layout for accessing 16bit values
+    // with 8bit LUT.
+    //
+    //   00  01  02  03  04  05  06  07
+    // +----------------------------------->
+    // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
+    // +----------------------------------->
+    //    i-0     i-1     i-2     i-3
+    // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+    ld1 {v2.8B}, [x1]
+    // load src[x]
+    ushll v0.8H, v2.8B, #0
+    // >> shift
+    ushr v2.8H, v0.8H, #3 // BIT_DEPTH - 3
+    // x2 (access lower short)
+    shl v1.8H, v2.8H, #1 // low (x2, accessing short)
+    // +1 access upper short
+    add v3.8H, v1.8H, v20.8H
+    // shift insert index to upper byte
+    sli v1.8H, v3.8H, #8
+    // table
+    tbx v2.16B, {v16.16B-v19.16B}, v1.16B
+    // src[x] + table
+    add v1.8H, v0.8H, v2.8H
+    // clip + narrow
+    sqxtun v4.8B, v1.8H
+    // store
+    st1 {v4.8B}, [x0]
+    // done 8 pixels
+    subs w8, w8, #8
+    bne 8b
+    // finished line
+    subs w7, w7, #1
+    add x0, x0, x2 // dst += stride_dst
+    add x1, x1, x3 // src += stride_src
+    bne 0b
+    ret
+endfunc