diff mbox series

[FFmpeg-devel,1/4] lavc/aarch64: add HEVC add_residual NEON

Message ID 20210107121020.86179-2-josh@itanimul.li
State New
Headers show
Series AArch64 NEON for HEVC
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Josh Dekker Jan. 7, 2021, 12:10 p.m. UTC
Signed-off-by: Josh Dekker <josh@itanimul.li>
---
 libavcodec/aarch64/Makefile               |   2 +
 libavcodec/aarch64/hevcdsp_add_res_neon.S | 298 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init.c         |  59 +++++
 libavcodec/hevcdsp.c                      |   2 +
 libavcodec/hevcdsp.h                      |   1 +
 5 files changed, 362 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_add_res_neon.S
 create mode 100644 libavcodec/aarch64/hevcdsp_init.c

Comments

Martin Storsjö Jan. 16, 2021, 10:54 p.m. UTC | #1
On Thu, 7 Jan 2021, Josh Dekker wrote:

> Signed-off-by: Josh Dekker <josh@itanimul.li>
> ---
> libavcodec/aarch64/Makefile               |   2 +
> libavcodec/aarch64/hevcdsp_add_res_neon.S | 298 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init.c         |  59 +++++
> libavcodec/hevcdsp.c                      |   2 +
> libavcodec/hevcdsp.h                      |   1 +
> 5 files changed, 362 insertions(+)
> create mode 100644 libavcodec/aarch64/hevcdsp_add_res_neon.S
> create mode 100644 libavcodec/aarch64/hevcdsp_init.c

This one is pretty much equivalent to Reimar's patch. As his one goes on 
top of the ported IDCT, I think I'd prefer his version of it. But in any 
case, some comments below:

> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index f6434e40da..4bdd554e7e 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -17,6 +17,7 @@ OBJS-$(CONFIG_VP8DSP)                   += aarch64/vp8dsp_init_aarch64.o
> OBJS-$(CONFIG_AAC_DECODER)              += aarch64/aacpsdsp_init_aarch64.o \
>                                            aarch64/sbrdsp_init_aarch64.o
> OBJS-$(CONFIG_DCA_DECODER)              += aarch64/synth_filter_init.o
> +OBJS-$(CONFIG_HEVC_DECODER)             += aarch64/hevcdsp_init.o
> OBJS-$(CONFIG_OPUS_DECODER)             += aarch64/opusdsp_init.o
> OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
> OBJS-$(CONFIG_VC1DSP)                   += aarch64/vc1dsp_init_aarch64.o
> @@ -53,6 +54,7 @@ NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
> # decoders/encoders
> NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
> NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
> +NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_add_res_neon.o
> NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opusdsp_neon.o
> NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
> NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
> diff --git a/libavcodec/aarch64/hevcdsp_add_res_neon.S b/libavcodec/aarch64/hevcdsp_add_res_neon.S
> new file mode 100644
> index 0000000000..4005366192
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_add_res_neon.S
> @@ -0,0 +1,298 @@
> +/* -*-arm64-*-
> + *
> + * AArch64 NEON optimised add residual functions for HEVC decoding
> + *
> + * Copyright (c) 2020 Josh Dekker <josh@itanimul.li>

I believe this one is at least a bit inspired by the arm version, right? 
In that case it's probably customary to bring the original copyright 
along.

> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +.macro clip10 in1, in2, c1, c2
> +    smax \in1, \in1, \c1
> +    smax \in2, \in2, \c1
> +    smin \in1, \in1, \c2
> +    smin \in2, \in2, \c2
> +.endm
> +
> +function ff_hevc_add_residual_4x4_8_neon, export=1
> +    mov x3, x0

Please align the instructions and operands like in the existing assembly. 
Also, in addition to aligning those two columns, in aarch64 assembly I 
write myself, I also try to align the individual operands; for 
instructions that only take GPRs, I'd write things as "x0, x1, x2", to 
keep operands aligned the same for cases with registers >= x10, and for 
instructions that take vector registers, align everything to line up for 
the longest case register name, e.g. v31.16b. For SIMD loads, stores and 
other things, where things don't generally line up, I just try to make the 
code look pretty and consistent.

Also, as a matter of taste, I tend to write the lane specifications with
lower-case letters, i.e. .16b instead of .16B.

> +    ld1 {v0.S}[0], [x3], x2
> +    ld1 {v0.S}[1], [x3], x2
> +    ld1 {v1.S}[0], [x3], x2
> +    ld1 {v1.S}[1], [x3], x2
> +    ld1 { v2.8H-v3.8H}, [x1]
> +    ushll v4.8H, v0.8B, #0
> +    ushll v5.8H, v1.8B, #0

ushll with a shift of #0 is equivalent to uxtl, so the shorter alias can be used here.

> +    add v6.8H, v4.8H, v2.8H
> +    add v7.8H, v5.8H, v3.8H

The arm version (and Reimar's) does saturated addition here, i.e. sqadd.

There's a bunch of other minor tuning one could suggest here, but I guess
it applies equally to the existing arm versions (and it's unclear whether
it would provide any measurable benefit at all), so I'll refrain from
suggesting it here.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index f6434e40da..4bdd554e7e 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -17,6 +17,7 @@  OBJS-$(CONFIG_VP8DSP)                   += aarch64/vp8dsp_init_aarch64.o
 OBJS-$(CONFIG_AAC_DECODER)              += aarch64/aacpsdsp_init_aarch64.o \
                                            aarch64/sbrdsp_init_aarch64.o
 OBJS-$(CONFIG_DCA_DECODER)              += aarch64/synth_filter_init.o
+OBJS-$(CONFIG_HEVC_DECODER)             += aarch64/hevcdsp_init.o
 OBJS-$(CONFIG_OPUS_DECODER)             += aarch64/opusdsp_init.o
 OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
 OBJS-$(CONFIG_VC1DSP)                   += aarch64/vc1dsp_init_aarch64.o
@@ -53,6 +54,7 @@  NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
 # decoders/encoders
 NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
 NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_add_res_neon.o
 NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opusdsp_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
 NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
diff --git a/libavcodec/aarch64/hevcdsp_add_res_neon.S b/libavcodec/aarch64/hevcdsp_add_res_neon.S
new file mode 100644
index 0000000000..4005366192
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_add_res_neon.S
@@ -0,0 +1,298 @@ 
+/* -*-arm64-*-
+ *
+ * AArch64 NEON optimised add residual functions for HEVC decoding
+ *
+ * Copyright (c) 2020 Josh Dekker <josh@itanimul.li>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro clip10 in1, in2, c1, c2  // clamp in1 and in2 lane-wise to the signed range [c1, c2]
+    smax \in1, \in1, \c1        // raise lanes below the lower bound c1
+    smax \in2, \in2, \c1
+    smin \in1, \in1, \c2        // lower lanes above the upper bound c2
+    smin \in2, \in2, \c2
+.endm
+
+function ff_hevc_add_residual_4x4_8_neon, export=1  // x0 = dst, x1 = coeffs, x2 = stride in bytes
+    mov x3, x0                  // x3 walks dst for the loads; x0 is kept for the stores
+    ld1 {v0.S}[0], [x3], x2     // rows 0-3: 4 pixels each, two rows packed per register
+    ld1 {v0.S}[1], [x3], x2
+    ld1 {v1.S}[0], [x3], x2
+    ld1 {v1.S}[1], [x3], x2
+    ld1 { v2.8H-v3.8H}, [x1]    // all 16 residual coefficients
+    uxtl v4.8H, v0.8B           // widen pixels to 16 bit (alias of ushll #0)
+    uxtl v5.8H, v1.8B
+    sqadd v6.8H, v4.8H, v2.8H   // saturating add: a plain add would wrap for extreme residuals
+    sqadd v7.8H, v5.8H, v3.8H
+    sqxtun v0.8B, v6.8H         // narrow with unsigned saturation, i.e. clip to 0..255
+    sqxtun v1.8B, v7.8H
+    st1 {v0.S}[0], [x0], x2     // write the four rows back
+    st1 {v0.S}[1], [x0], x2
+    st1 {v1.S}[0], [x0], x2
+    st1 {v1.S}[1], [x0], x2
+    ret
+endfunc
+
+function ff_hevc_add_residual_4x4_10_neon, export=1  // x0 = dst, x1 = coeffs, x2 = stride in bytes
+    mov x3, x0                  // x3 walks dst for the loads; x0 is kept for the stores
+    movi v4.8H, #0              // lower clip bound
+    mvni v5.8H, #0xFC, lsl #8   // upper clip bound: ~0xFC00 = 0x03FF = 1023
+    ld1 {v0.D}[0], [x3], x2     // rows 0-3: 4 16-bit pixels each, two rows packed per register
+    ld1 {v0.D}[1], [x3], x2
+    ld1 {v1.D}[0], [x3], x2
+    ld1 {v1.D}[1], [x3], x2
+    ld1 { v2.8H-v3.8H}, [x1]    // all 16 residual coefficients
+    sqadd v2.8H, v0.8H, v2.8H   // saturating add: a plain add would wrap for extreme residuals
+    sqadd v3.8H, v1.8H, v3.8H
+    clip10 v2.8H, v3.8H, v4.8H, v5.8H  // clamp to 0..1023
+    st1 {v2.D}[0], [x0], x2     // write the four rows back
+    st1 {v2.D}[1], [x0], x2
+    st1 {v3.D}[0], [x0], x2
+    st1 {v3.D}[1], [x0], x2
+    ret
+endfunc
+
+function ff_hevc_add_residual_8x8_8_neon, export=1  // x0 = dst, x1 = coeffs, x2 = stride in bytes
+    mov x3, x0                  // x3 walks dst for the loads; x0 is kept for the stores
+    ld1 {v0.8B}, [x3], x2       // rows 0-7, 8 pixels each
+    ld1 {v1.8B}, [x3], x2
+    ld1 {v2.8B}, [x3], x2
+    ld1 {v3.8B}, [x3], x2
+    ld1 {v4.8B}, [x3], x2
+    ld1 {v5.8B}, [x3], x2
+    ld1 {v6.8B}, [x3], x2
+    ld1 {v7.8B}, [x3], x2
+    ld1 { v16.8H-v19.8H}, [x1], #64  // residuals for rows 0-3
+    ld1 { v20.8H-v23.8H}, [x1]       // residuals for rows 4-7
+    uxtl v24.8H, v0.8B          // widen pixels to 16 bit (alias of ushll #0)
+    uxtl v25.8H, v1.8B
+    uxtl v26.8H, v2.8B
+    uxtl v27.8H, v3.8B
+    uxtl v28.8H, v4.8B
+    uxtl v29.8H, v5.8B
+    uxtl v30.8H, v6.8B
+    uxtl v31.8H, v7.8B
+    sqadd v0.8H, v24.8H, v16.8H // saturating add: a plain add would wrap for extreme residuals
+    sqadd v1.8H, v25.8H, v17.8H
+    sqadd v2.8H, v26.8H, v18.8H
+    sqadd v3.8H, v27.8H, v19.8H
+    sqadd v4.8H, v28.8H, v20.8H
+    sqadd v5.8H, v29.8H, v21.8H
+    sqadd v6.8H, v30.8H, v22.8H
+    sqadd v7.8H, v31.8H, v23.8H
+    sqxtun v24.8B, v0.8H        // narrow with unsigned saturation, i.e. clip to 0..255
+    sqxtun v25.8B, v1.8H
+    sqxtun v26.8B, v2.8H
+    sqxtun v27.8B, v3.8H
+    sqxtun v28.8B, v4.8H
+    sqxtun v29.8B, v5.8H
+    sqxtun v30.8B, v6.8H
+    sqxtun v31.8B, v7.8H
+    st1 {v24.8B}, [x0], x2      // write the eight rows back
+    st1 {v25.8B}, [x0], x2
+    st1 {v26.8B}, [x0], x2
+    st1 {v27.8B}, [x0], x2
+    st1 {v28.8B}, [x0], x2
+    st1 {v29.8B}, [x0], x2
+    st1 {v30.8B}, [x0], x2
+    st1 {v31.8B}, [x0], x2
+    ret
+endfunc
+
+function ff_hevc_add_residual_8x8_10_neon, export=1  // x0 = dst, x1 = coeffs, x2 = stride in bytes
+    mov x3, x0                  // x3 walks dst for the loads; x0 is kept for the stores
+    movi v0.8H, #0              // lower clip bound
+    mvni v1.8H, #0xFC, lsl #8   // upper clip bound: ~0xFC00 = 0x03FF = 1023
+    ld1 {v24.8H}, [x3], x2      // rows 0-7, 8 16-bit pixels each
+    ld1 {v25.8H}, [x3], x2
+    ld1 {v26.8H}, [x3], x2
+    ld1 {v27.8H}, [x3], x2
+    ld1 {v28.8H}, [x3], x2
+    ld1 {v29.8H}, [x3], x2
+    ld1 {v30.8H}, [x3], x2
+    ld1 {v31.8H}, [x3], x2
+    ld1 {v16.8H-v19.8H}, [x1], #64  // residuals for rows 0-3
+    ld1 {v20.8H-v23.8H}, [x1]       // residuals for rows 4-7
+    sqadd v16.8H, v16.8H, v24.8H    // saturating add: a plain add would wrap for extreme residuals
+    sqadd v17.8H, v17.8H, v25.8H
+    clip10 v16.8H, v17.8H, v0.8H, v1.8H  // clamp to 0..1023
+    sqadd v18.8H, v18.8H, v26.8H
+    sqadd v19.8H, v19.8H, v27.8H
+    clip10 v18.8H, v19.8H, v0.8H, v1.8H
+    sqadd v20.8H, v20.8H, v28.8H
+    sqadd v21.8H, v21.8H, v29.8H
+    clip10 v20.8H, v21.8H, v0.8H, v1.8H
+    sqadd v22.8H, v22.8H, v30.8H
+    sqadd v23.8H, v23.8H, v31.8H
+    clip10 v22.8H, v23.8H, v0.8H, v1.8H
+    st1 {v16.8H}, [x0], x2      // write the eight rows back
+    st1 {v17.8H}, [x0], x2
+    st1 {v18.8H}, [x0], x2
+    st1 {v19.8H}, [x0], x2
+    st1 {v20.8H}, [x0], x2
+    st1 {v21.8H}, [x0], x2
+    st1 {v22.8H}, [x0], x2
+    st1 {v23.8H}, [x0], x2
+    ret
+endfunc
+
+function ff_hevc_add_residual_16x16_8_neon, export=1  // x0 = dst, x1 = coeffs, x2 = stride in bytes
+    mov x3, x0                  // x3 walks dst for the loads; x0 is kept for the stores
+    mov x4, #4                  // 4 iterations of 4 rows each
+1:
+    subs x4, x4, #1             // sets the flags tested by b.ne at the end of the loop
+    ld1 { v0.8B- v1.8B}, [x3], x2    // four rows of 16 pixels, two registers per row
+    ld1 { v2.8B- v3.8B}, [x3], x2
+    ld1 { v4.8B- v5.8B}, [x3], x2
+    ld1 { v6.8B- v7.8B}, [x3], x2
+    ld1 {v16.8H-v19.8H}, [x1], #64   // residuals for the first two rows
+    ld1 {v20.8H-v23.8H}, [x1], #64   // residuals for the last two rows
+    uxtl v24.8H, v0.8B          // widen pixels to 16 bit (alias of ushll #0)
+    uxtl v25.8H, v1.8B
+    uxtl v26.8H, v2.8B
+    uxtl v27.8H, v3.8B
+    uxtl v28.8H, v4.8B
+    uxtl v29.8H, v5.8B
+    uxtl v30.8H, v6.8B
+    uxtl v31.8H, v7.8B
+    sqadd v0.8H, v24.8H, v16.8H // saturating add: a plain add would wrap for extreme residuals
+    sqadd v1.8H, v25.8H, v17.8H
+    sqadd v2.8H, v26.8H, v18.8H
+    sqadd v3.8H, v27.8H, v19.8H
+    sqadd v4.8H, v28.8H, v20.8H
+    sqadd v5.8H, v29.8H, v21.8H
+    sqadd v6.8H, v30.8H, v22.8H
+    sqadd v7.8H, v31.8H, v23.8H
+    sqxtun v24.8B, v0.8H        // narrow with unsigned saturation, i.e. clip to 0..255
+    sqxtun v25.8B, v1.8H
+    sqxtun v26.8B, v2.8H
+    sqxtun v27.8B, v3.8H
+    sqxtun v28.8B, v4.8H
+    sqxtun v29.8B, v5.8H
+    sqxtun v30.8B, v6.8H
+    sqxtun v31.8B, v7.8H
+    st1 {v24.8B-v25.8B}, [x0], x2    // write the four rows back
+    st1 {v26.8B-v27.8B}, [x0], x2
+    st1 {v28.8B-v29.8B}, [x0], x2
+    st1 {v30.8B-v31.8B}, [x0], x2
+    b.ne 1b
+    ret
+endfunc
+
+function ff_hevc_add_residual_16x16_10_neon, export=1  // x0 = dst, x1 = coeffs, x2 = stride in bytes
+    mov x3, x0                  // x3 walks dst for the loads; x0 is kept for the stores
+    mov x4, #4                  // 4 iterations of 4 rows each
+    movi v0.8H, #0              // lower clip bound
+    mvni v1.8H, #0xFC, lsl #8   // upper clip bound: ~0xFC00 = 0x03FF = 1023
+1:
+    subs x4, x4, #1             // sets the flags tested by b.ne at the end of the loop
+    ld1 {v16.8H-v17.8H}, [x3], x2    // four rows of 16 16-bit pixels, two registers per row
+    ld1 {v18.8H-v19.8H}, [x3], x2
+    ld1 {v20.8H-v21.8H}, [x3], x2
+    ld1 {v22.8H-v23.8H}, [x3], x2
+    ld1 {v24.8H-v27.8H}, [x1], #64   // residuals for the first two rows
+    ld1 {v28.8H-v31.8H}, [x1], #64   // residuals for the last two rows
+    sqadd v16.8H, v16.8H, v24.8H     // saturating add: a plain add would wrap for extreme residuals
+    sqadd v17.8H, v17.8H, v25.8H
+    clip10 v16.8H, v17.8H, v0.8H, v1.8H  // clamp to 0..1023
+    sqadd v18.8H, v18.8H, v26.8H
+    sqadd v19.8H, v19.8H, v27.8H
+    clip10 v18.8H, v19.8H, v0.8H, v1.8H
+    sqadd v20.8H, v20.8H, v28.8H
+    sqadd v21.8H, v21.8H, v29.8H
+    clip10 v20.8H, v21.8H, v0.8H, v1.8H
+    sqadd v22.8H, v22.8H, v30.8H
+    sqadd v23.8H, v23.8H, v31.8H
+    clip10 v22.8H, v23.8H, v0.8H, v1.8H
+    st1 {v16.8H-v17.8H}, [x0], x2    // write the four rows back
+    st1 {v18.8H-v19.8H}, [x0], x2
+    st1 {v20.8H-v21.8H}, [x0], x2
+    st1 {v22.8H-v23.8H}, [x0], x2
+    b.ne 1b
+    ret
+endfunc
+
+function ff_hevc_add_residual_32x32_8_neon, export=1  // x0 = dst, x1 = coeffs, x2 = stride in bytes
+    mov x3, x0                  // x3 walks dst for the loads; x0 is kept for the stores
+    mov x4, #16                 // 16 iterations of 2 rows each
+1:
+    subs x4, x4, #1             // sets the flags tested by b.ne at the end of the loop
+    ld1 { v0.8B- v3.8B}, [x3], x2    // two rows of 32 pixels, four registers per row
+    ld1 { v4.8B- v7.8B}, [x3], x2
+    ld1 {v16.8H-v19.8H}, [x1], #64   // residuals for the first row
+    ld1 {v20.8H-v23.8H}, [x1], #64   // residuals for the second row
+    uxtl v24.8H, v0.8B          // widen pixels to 16 bit (alias of ushll #0)
+    uxtl v25.8H, v1.8B
+    uxtl v26.8H, v2.8B
+    uxtl v27.8H, v3.8B
+    uxtl v28.8H, v4.8B
+    uxtl v29.8H, v5.8B
+    uxtl v30.8H, v6.8B
+    uxtl v31.8H, v7.8B
+    sqadd v0.8H, v24.8H, v16.8H // saturating add: a plain add would wrap for extreme residuals
+    sqadd v1.8H, v25.8H, v17.8H
+    sqadd v2.8H, v26.8H, v18.8H
+    sqadd v3.8H, v27.8H, v19.8H
+    sqadd v4.8H, v28.8H, v20.8H
+    sqadd v5.8H, v29.8H, v21.8H
+    sqadd v6.8H, v30.8H, v22.8H
+    sqadd v7.8H, v31.8H, v23.8H
+    sqxtun v24.8B, v0.8H        // narrow with unsigned saturation, i.e. clip to 0..255
+    sqxtun v25.8B, v1.8H
+    sqxtun v26.8B, v2.8H
+    sqxtun v27.8B, v3.8H
+    sqxtun v28.8B, v4.8H
+    sqxtun v29.8B, v5.8H
+    sqxtun v30.8B, v6.8H
+    sqxtun v31.8B, v7.8H
+    st1 {v24.8B-v27.8B}, [x0], x2    // write the two rows back
+    st1 {v28.8B-v31.8B}, [x0], x2
+    b.ne 1b
+    ret
+endfunc
+
+function ff_hevc_add_residual_32x32_10_neon, export=1  // x0 = dst, x1 = coeffs, x2 = stride in bytes
+    mov x3, x0                  // x3 walks dst for the loads; x0 is kept for the stores
+    mov x4, #16                 // 16 iterations
+    movi v0.8H, #0              // lower clip bound
+    mvni v1.8H, #0xFC, lsl #8   // upper clip bound: ~0xFC00 = 0x03FF = 1023
+1:
+    subs x4, x4, #1             // sets the flags tested by b.ne at the end of the loop
+    ld1 {v16.8H-v19.8H}, [x3], x2    // 32 16-bit pixels, then advance by stride
+    ld1 {v20.8H-v23.8H}, [x3], x2    // next 32 pixels, one stride further
+    ld1 {v24.8H-v27.8H}, [x1], #64   // 32 residuals for the first load
+    ld1 {v28.8H-v31.8H}, [x1], #64   // 32 residuals for the second load
+    sqadd v16.8H, v16.8H, v24.8H     // saturating add: a plain add would wrap for extreme residuals
+    sqadd v17.8H, v17.8H, v25.8H
+    clip10 v16.8H, v17.8H, v0.8H, v1.8H  // clamp to 0..1023
+    sqadd v18.8H, v18.8H, v26.8H
+    sqadd v19.8H, v19.8H, v27.8H
+    clip10 v18.8H, v19.8H, v0.8H, v1.8H
+    sqadd v20.8H, v20.8H, v28.8H
+    sqadd v21.8H, v21.8H, v29.8H
+    clip10 v20.8H, v21.8H, v0.8H, v1.8H
+    sqadd v22.8H, v22.8H, v30.8H
+    sqadd v23.8H, v23.8H, v31.8H
+    clip10 v22.8H, v23.8H, v0.8H, v1.8H
+    st1 {v16.8H-v19.8H}, [x0], x2    // write both groups of 32 pixels back
+    st1 {v20.8H-v23.8H}, [x0], x2
+    b.ne 1b
+    ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init.c b/libavcodec/aarch64/hevcdsp_init.c
new file mode 100644
index 0000000000..f0a617ab39
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_init.c
@@ -0,0 +1,59 @@ 
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "libavcodec/avcodec.h"
+
+void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                     ptrdiff_t stride);   /* 4x4, 8-bit */
+void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);  /* 4x4, 10-bit */
+void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                     ptrdiff_t stride);   /* 8x8, 8-bit */
+void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);  /* 8x8, 10-bit */
+void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                       ptrdiff_t stride);  /* 16x16, 8-bit */
+void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                        ptrdiff_t stride); /* 16x16, 10-bit */
+void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                       ptrdiff_t stride);  /* 32x32, 8-bit */
+void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                        ptrdiff_t stride); /* 32x32, 10-bit */
+
+av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
+{
+    /* Install the NEON add_residual kernels matching bit_depth (8 or 10). */
+    if (!have_neon(av_get_cpu_flags()))
+        return;
+    if (bit_depth == 8) {
+        c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon;
+        c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon;
+        c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon;
+        c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
+    } else if (bit_depth == 10) {
+        c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
+        c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon;
+        c->add_residual[2] = ff_hevc_add_residual_16x16_10_neon;
+        c->add_residual[3] = ff_hevc_add_residual_32x32_10_neon;
+    }
+}
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 957e40d5ff..fe272ac1ce 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -257,6 +257,8 @@  int i = 0;
         break;
     }
 
+    if (ARCH_AARCH64)
+        ff_hevc_dsp_init_aarch64(hevcdsp, bit_depth);
     if (ARCH_ARM)
         ff_hevc_dsp_init_arm(hevcdsp, bit_depth);
     if (ARCH_PPC)
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index c605a343d6..0e013a8328 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -129,6 +129,7 @@  void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
 extern const int8_t ff_hevc_epel_filters[7][4];
 extern const int8_t ff_hevc_qpel_filters[3][16];
 
+void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);