diff mbox series

[FFmpeg-devel,v2,4/4] avcodec/aarch64/hevcdsp: add sao_band NEON

Message ID 20210204113259.20112-5-josh@itanimul.li
State New
Headers show
Series avcodec/aarch64/hevcdsp | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Josh Dekker Feb. 4, 2021, 11:32 a.m. UTC
Only works for 8x8.

Signed-off-by: Josh Dekker <josh@itanimul.li>
---
 libavcodec/aarch64/Makefile               |  3 +-
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  7 ++
 libavcodec/aarch64/hevcdsp_sao_neon.S     | 87 +++++++++++++++++++++++
 3 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_sao_neon.S

Comments

Martin Storsjö Feb. 11, 2021, 9:53 a.m. UTC | #1
On Thu, 4 Feb 2021, Josh Dekker wrote:

> Only works for 8x8.
>
> Signed-off-by: Josh Dekker <josh@itanimul.li>
> ---
> libavcodec/aarch64/Makefile               |  3 +-
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  7 ++
> libavcodec/aarch64/hevcdsp_sao_neon.S     | 87 +++++++++++++++++++++++
> 3 files changed, 96 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/hevcdsp_sao_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 2ea1d74a38..954461f81d 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -62,4 +62,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
>                                            aarch64/vp9mc_16bpp_neon.o          \
>                                            aarch64/vp9mc_neon.o
> NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
> -                                           aarch64/hevcdsp_init_aarch64.o
> +                                           aarch64/hevcdsp_init_aarch64.o      \
> +                                           aarch64/hevcdsp_sao_neon.o
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index fe111bd1ac..c785e46f79 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -53,6 +53,12 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
> void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
> void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
> void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
> +void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
> +                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
> +                                  int16_t *sao_offset_val, int sao_left_class,
> +                                  int width, int height);
> +
> +
> 
> av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> {
> @@ -69,6 +75,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
>         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
>         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
> +        c->sao_band_filter[0]          = ff_hevc_sao_band_filter_8x8_8_neon;
>     }
>     if (bit_depth == 10) {
>         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
> diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
> new file mode 100644
> index 0000000000..f142c1e8c2
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
> @@ -0,0 +1,87 @@
> +/* -*-arm64-*-
> + * vim: syntax=arm64asm
> + *
> + * AArch64 NEON optimised SAO functions for HEVC decoding
> + *
> + * Copyright (c) 2020 Josh Dekker <josh@itanimul.li>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +// void sao_band_filter(uint8_t *_dst, uint8_t *_src,
> +//                      ptrdiff_t stride_dst, ptrdiff_t stride_src,
> +//                      int16_t *sao_offset_val, int sao_left_class,
> +//                      int width, int height)
> +function ff_hevc_sao_band_filter_8x8_8_neon, export=1
> +    sub sp, sp, #64
> +    stp xzr, xzr, [sp]
> +    stp xzr, xzr, [sp, #16]
> +    stp xzr, xzr, [sp, #32]
> +    stp xzr, xzr, [sp, #48]
> +    mov w8, #4
> +0:
> +    ldrsh x9, [x4, x8, lsl #1] // x9 = sao_offset_val[k+1]
> +    subs w8, w8, #1
> +    add w10, w8, w5 // x10 = k + sao_left_class
> +    and w10, w10, #0x1F
> +    strh w9, [sp, x10, lsl #1]
> +    bne 0b
> +    ld1 {v16.16b-v19.16b}, [sp], #64
> +    movi v20.8h, #1
> +1:  // beginning of line

No technical objections, it seems to build fine in all environments, and 
gives a consistent speedup over C, so that's good even if things maybe 
could be even faster. Didn't look closer at the algorithm so far. But the 
indentation is way different than all other asm, so please fix that.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 2ea1d74a38..954461f81d 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -62,4 +62,5 @@  NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
                                            aarch64/vp9mc_16bpp_neon.o          \
                                            aarch64/vp9mc_neon.o
 NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
-                                           aarch64/hevcdsp_init_aarch64.o
+                                           aarch64/hevcdsp_init_aarch64.o      \
+                                           aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index fe111bd1ac..c785e46f79 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -53,6 +53,12 @@  void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
+void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                  int16_t *sao_offset_val, int sao_left_class,
+                                  int width, int height);
+
+
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -69,6 +75,7 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
+        c->sao_band_filter[0]          = ff_hevc_sao_band_filter_8x8_8_neon;
     }
     if (bit_depth == 10) {
         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
new file mode 100644
index 0000000000..f142c1e8c2
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -0,0 +1,87 @@ 
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * AArch64 NEON optimised SAO functions for HEVC decoding
+ *
+ * Copyright (c) 2020 Josh Dekker <josh@itanimul.li>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// void sao_band_filter(uint8_t *_dst, uint8_t *_src,
+//                      ptrdiff_t stride_dst, ptrdiff_t stride_src,
+//                      int16_t *sao_offset_val, int sao_left_class,
+//                      int width, int height)
+//
+// 8-bit SAO band filter: pixels whose band (src[x] >> 3) is one of
+// sao_left_class + 0..3 (mod 32) get sao_offset_val[1..4] added, with the
+// result clipped to 0..255; all other pixels are copied unchanged.
+//
+// x0 = dst, x1 = src, x2 = stride_dst, x3 = stride_src, x4 = sao_offset_val,
+// w5 = sao_left_class, w6 = width, w7 = height (width and height must be > 0).
+// width is rounded up to a multiple of 8, so every row must be accessible
+// up to that size (NOTE(review): confirm the caller pads its buffers).
+// Uses 64 bytes of stack; clobbers x0-x3, x6-x10, v0-v4, v16-v20 and the flags.
+function ff_hevc_sao_band_filter_8x8_8_neon, export=1
+        sub             sp,  sp,  #64
+        stp             xzr, xzr, [sp]             // zero the 32 x int16 offset table
+        stp             xzr, xzr, [sp, #16]
+        stp             xzr, xzr, [sp, #32]
+        stp             xzr, xzr, [sp, #48]
+        mov             w8,  #4
+0:      ldrsh           x9,  [x4, x8, lsl #1]      // x9 = sao_offset_val[k + 1]
+        subs            w8,  w8,  #1               // k = 3..0
+        add             w10, w8,  w5               // w10 = k + sao_left_class
+        and             w10, w10, #0x1F            // wrap around the 32 bands
+        strh            w9,  [sp, x10, lsl #1]     // table[band] = offset
+        b.ne            0b
+        add             w6,  w6,  #7               // round width up to a multiple of 8
+        bic             w6,  w6,  #7               // (the w write zero-extends into x6)
+        ld1             {v16.16b-v19.16b}, [sp], #64 // table -> v16-v19, release the stack
+        sub             x2,  x2,  x6               // pointers advance by width per row,
+        sub             x3,  x3,  x6               // so strides only skip the row tail
+        movi            v20.8h,  #1
+1:      mov             w8,  w6                    // beginning of line
+2:      // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+        // The table holds 16-bit offsets but TBX looks up bytes, so each
+        // halfword index i is turned into the byte index pair (2*i, 2*i + 1):
+        //
+        //   00  01  02  03  04  05  06  07
+        // +----------------------------------->
+        // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
+        // +----------------------------------->
+        //    i-0     i-1     i-2     i-3
+        ld1             {v2.8b}, [x1], #8          // load 8 pixels, src += 8
+        uxtl            v0.8h,  v2.8b              // widen src[x] to 16 bit
+        ushr            v2.8h,  v0.8h,  #3         // band = src[x] >> 3 (256 values / 32 bands)
+        shl             v1.8h,  v2.8h,  #1         // low byte index  = 2 * band
+        add             v3.8h,  v1.8h,  v20.8h     // high byte index = 2 * band + 1
+        sli             v1.8h,  v3.8h,  #8         // pack as (high << 8) | low
+        tbx             v2.16b, {v16.16b-v19.16b}, v1.16b // indices < 64: v2 = table[band]
+        add             v1.8h,  v0.8h,  v2.8h      // src[x] + offset
+        sqxtun          v4.8b,  v1.8h              // clip to 0..255 and narrow
+        st1             {v4.8b}, [x0], #8          // store 8 pixels, dst += 8
+        subs            w8,  w8,  #8               // done 8 pixels
+        b.ne            2b
+        subs            w7,  w7,  #1               // finished line
+        add             x0,  x0,  x2               // dst += stride_dst - width
+        add             x1,  x1,  x3               // src += stride_src - width
+        b.ne            1b
+        ret
+endfunc