Message ID | 20230329141346.3718-2-jdek@itanimul.li |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,v4,1/2] checkasm: add hevc_deblock chroma test | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On Wed, 29 Mar 2023, J. Dekker wrote: > Benched on Ampere Altra: > > hevc_h_loop_filter_chroma8_c: 367.7 > hevc_h_loop_filter_chroma8_neon: 31.0 > hevc_h_loop_filter_chroma10_c: 396.7 > hevc_h_loop_filter_chroma10_neon: 27.5 > hevc_h_loop_filter_chroma12_c: 377.0 > hevc_h_loop_filter_chroma12_neon: 31.7 > hevc_v_loop_filter_chroma8_c: 369.0 > hevc_v_loop_filter_chroma8_neon: 55.0 > hevc_v_loop_filter_chroma10_c: 389.0 > hevc_v_loop_filter_chroma10_neon: 54.0 > hevc_v_loop_filter_chroma12_c: 389.5 > hevc_v_loop_filter_chroma12_neon: 53.0 > > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > > Included Martin's comments, decent speedup on vertical filter (~50%). > > libavcodec/aarch64/Makefile | 3 +- > libavcodec/aarch64/hevcdsp_deblock_neon.S | 180 ++++++++++++++++++++++ > libavcodec/aarch64/hevcdsp_init_aarch64.c | 18 +++ > 3 files changed, 200 insertions(+), 1 deletion(-) > create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S > > diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile > index 02fb51c3ab..216191640c 100644 > --- a/libavcodec/aarch64/Makefile > +++ b/libavcodec/aarch64/Makefile > @@ -65,7 +65,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ > aarch64/vp9lpf_neon.o \ > aarch64/vp9mc_16bpp_neon.o \ > aarch64/vp9mc_neon.o > -NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \ > +NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_deblock_neon.o \ > + aarch64/hevcdsp_idct_neon.o \ > aarch64/hevcdsp_init_aarch64.o \ > aarch64/hevcdsp_qpel_neon.o \ > aarch64/hevcdsp_sao_neon.o > diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S > new file mode 100644 > index 0000000000..ed342e5ded > --- /dev/null > +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S > @@ -0,0 +1,180 @@ > +/* -*-arm64-*- > + * vim: syntax=arm64asm > + * > + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi> > + * Copyright (c) 2023 J. 
Dekker <jdek@itanimul.li> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > + > +#include "libavutil/aarch64/asm.S" > +#include "neon.S" > + > +.macro hevc_loop_filter_chroma_start bitdepth > + mov x4, x30 > + ldr w14, [x2] > + ldr w15, [x2, #4] > +.if \bitdepth > 8 > + lsl w14, w14, #(\bitdepth - 8) > + lsl w15, w15, #(\bitdepth - 8) > +.endif > + adds w2, w14, w15 > + b.eq 1f > + dup v16.4h, w14 > + dup v17.4h, w15 > + trn1 v16.2d, v16.2d, v17.2d > +.if \bitdepth > 8 > + mvni v19.8h, #((0xff << (\bitdepth - 8)) & 0xff), lsl #8 > + movi v18.8h, #0 > +.endif > + neg v17.8h, v16.8h > +.endm > + > +.macro hevc_loop_filter_chroma_body bitdepth > +.if \bitdepth <= 8 > + uxtl v20.8h, v0.8b // p1 > + uxtl v1.8h, v1.8b // p0 > + uxtl v2.8h, v2.8b // q0 > + uxtl v23.8h, v3.8b // q1 > + va .req v20 > + vb .req v23 > +.else // required to specify both cases as we are unable to do: v0 .req v20 > + va .req v0 > + vb .req v3 > +.endif > + sub v5.8h, v2.8h, v1.8h // q0 - p0 > + sub v6.8h, va.8h, vb.8h // p1 - q1 > + shl v5.8h, v5.8h, #2 > + add v5.8h, v6.8h, v5.8h > + srshr v5.8h, v5.8h, #3 > + clip v17.8h, v16.8h, v5.8h > + sqadd v1.8h, v1.8h, v5.8h // p0 + delta > + sqsub v2.8h, v2.8h, v5.8h // q0 - delta > +.if \bitdepth 
<= 8 > + sqxtun v1.8b, v1.8h > + sqxtun v2.8b, v2.8h > +.unreq va > +.unreq vb

Shouldn't the .unreq be outside of the .if/.else?

> +.else > + clip v18.8h, v19.8h, v1.8h, v2.8h > +.endif > +.endm > + > +function hevc_loop_filter_chroma_body_8_neon, export=0 > + hevc_loop_filter_chroma_body 8 > + ret > +endfunc > + > +function hevc_loop_filter_chroma_body_10_neon, export=0 > +hevc_loop_filter_chroma_body_12_neon: > + hevc_loop_filter_chroma_body 10 > + ret > +endfunc > + > +// void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); > + > +.macro hevc_h_loop_filter_chroma bitdepth > +function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1 > + hevc_loop_filter_chroma_start \bitdepth > + sub x0, x0, x1, lsl #1 > +.if \bitdepth > 8 > + ld1 {v0.8h}, [x0], x1 > + ld1 {v1.8h}, [x0], x1 > + ld1 {v2.8h}, [x0], x1 > + ld1 {v3.8h}, [x0] > +.else > + ld1 {v0.8b}, [x0], x1 > + ld1 {v1.8b}, [x0], x1 > + ld1 {v2.8b}, [x0], x1 > + ld1 {v3.8b}, [x0] > +.endif > + sub x0, x0, x1, lsl #1 > + bl hevc_loop_filter_chroma_body_\bitdepth\()_neon > +.if \bitdepth > 8 > + st1 {v1.8h}, [x0], x1 > + st1 {v2.8h}, [x0] > +.else > + st1 {v1.8b}, [x0], x1 > + st1 {v2.8b}, [x0] > +.endif > +1: ret x4 > +endfunc > +.endm > + > +.macro hevc_v_loop_filter_chroma bitdepth > +function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1 > + hevc_loop_filter_chroma_start \bitdepth > + sub x0, x0, #((0x5200 >> \bitdepth) & 0x6) // high -> 4, low -> 2

TBH, I think this is rather obfuscated - I'd prefer to just move the sub (and the two instructions inbetween) back inside of the .if/.else, to have the sub instruction say more explicitly exactly what it does.

Other than that, this patch LGTM now.

// Martin
On Wed, Mar 29, 2023 at 11:29:09PM +0300, Martin Storsjö wrote: > On Wed, 29 Mar 2023, J. Dekker wrote: > > > Benched on Ampere Altra: > > > > hevc_h_loop_filter_chroma8_c: 367.7 > > hevc_h_loop_filter_chroma8_neon: 31.0 > > hevc_h_loop_filter_chroma10_c: 396.7 > > hevc_h_loop_filter_chroma10_neon: 27.5 > > hevc_h_loop_filter_chroma12_c: 377.0 > > hevc_h_loop_filter_chroma12_neon: 31.7 > > hevc_v_loop_filter_chroma8_c: 369.0 > > hevc_v_loop_filter_chroma8_neon: 55.0 > > hevc_v_loop_filter_chroma10_c: 389.0 > > hevc_v_loop_filter_chroma10_neon: 54.0 > > hevc_v_loop_filter_chroma12_c: 389.5 > > hevc_v_loop_filter_chroma12_neon: 53.0 > > > > Signed-off-by: J. Dekker <jdek@itanimul.li> > > --- > > > > Included Martin's comments, decent speedup on vertical filter (~50%). > > > > libavcodec/aarch64/Makefile | 3 +- > > libavcodec/aarch64/hevcdsp_deblock_neon.S | 180 ++++++++++++++++++++++ > > libavcodec/aarch64/hevcdsp_init_aarch64.c | 18 +++ > > 3 files changed, 200 insertions(+), 1 deletion(-) > > create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S > > > > diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile > > index 02fb51c3ab..216191640c 100644 > > --- a/libavcodec/aarch64/Makefile > > +++ b/libavcodec/aarch64/Makefile > > @@ -65,7 +65,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ > > aarch64/vp9lpf_neon.o \ > > aarch64/vp9mc_16bpp_neon.o \ > > aarch64/vp9mc_neon.o > > -NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \ > > +NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_deblock_neon.o \ > > + aarch64/hevcdsp_idct_neon.o \ > > aarch64/hevcdsp_init_aarch64.o \ > > aarch64/hevcdsp_qpel_neon.o \ > > aarch64/hevcdsp_sao_neon.o > > diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S > > new file mode 100644 > > index 0000000000..ed342e5ded > > --- /dev/null > > +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S > > @@ -0,0 +1,180 @@ > > +/* -*-arm64-*- > > 
+ * vim: syntax=arm64asm > > + * > > + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi> > > + * Copyright (c) 2023 J. Dekker <jdek@itanimul.li> > > + * > > + * This file is part of FFmpeg. > > + * > > + * FFmpeg is free software; you can redistribute it and/or > > + * modify it under the terms of the GNU Lesser General Public > > + * License as published by the Free Software Foundation; either > > + * version 2.1 of the License, or (at your option) any later version. > > + * > > + * FFmpeg is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + * Lesser General Public License for more details. > > + * > > + * You should have received a copy of the GNU Lesser General Public > > + * License along with FFmpeg; if not, write to the Free Software > > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > > + */ > > + > > + > > +#include "libavutil/aarch64/asm.S" > > +#include "neon.S" > > + > > +.macro hevc_loop_filter_chroma_start bitdepth > > + mov x4, x30 > > + ldr w14, [x2] > > + ldr w15, [x2, #4] > > +.if \bitdepth > 8 > > + lsl w14, w14, #(\bitdepth - 8) > > + lsl w15, w15, #(\bitdepth - 8) > > +.endif > > + adds w2, w14, w15 > > + b.eq 1f > > + dup v16.4h, w14 > > + dup v17.4h, w15 > > + trn1 v16.2d, v16.2d, v17.2d > > +.if \bitdepth > 8 > > + mvni v19.8h, #((0xff << (\bitdepth - 8)) & 0xff), lsl #8 > > + movi v18.8h, #0 > > +.endif > > + neg v17.8h, v16.8h > > +.endm > > + > > +.macro hevc_loop_filter_chroma_body bitdepth > > +.if \bitdepth <= 8 > > + uxtl v20.8h, v0.8b // p1 > > + uxtl v1.8h, v1.8b // p0 > > + uxtl v2.8h, v2.8b // q0 > > + uxtl v23.8h, v3.8b // q1 > > + va .req v20 > > + vb .req v23 > > +.else // required to specify both cases as we are unable to do: v0 .req v20 > > + va .req v0 > > + vb .req v3 > > +.endif > > + sub v5.8h, v2.8h, v1.8h // q0 - p0 > > + sub v6.8h, 
va.8h, vb.8h // p1 - q1 > > + shl v5.8h, v5.8h, #2 > > + add v5.8h, v6.8h, v5.8h > > + srshr v5.8h, v5.8h, #3 > > + clip v17.8h, v16.8h, v5.8h > > + sqadd v1.8h, v1.8h, v5.8h // p0 + delta > > + sqsub v2.8h, v2.8h, v5.8h // q0 - delta > > +.if \bitdepth <= 8 > > + sqxtun v1.8b, v1.8h > > + sqxtun v2.8b, v2.8h > > +.unreq va > > +.unreq vb > > Shouldn't the .unreq be outside of the .if/.else? > > > +.else > > + clip v18.8h, v19.8h, v1.8h, v2.8h > > +.endif > > +.endm > > + > > +function hevc_loop_filter_chroma_body_8_neon, export=0 > > + hevc_loop_filter_chroma_body 8 > > + ret > > +endfunc > > + > > +function hevc_loop_filter_chroma_body_10_neon, export=0 > > +hevc_loop_filter_chroma_body_12_neon: > > + hevc_loop_filter_chroma_body 10 > > + ret > > +endfunc > > + > > +// void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); > > + > > +.macro hevc_h_loop_filter_chroma bitdepth > > +function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1 > > + hevc_loop_filter_chroma_start \bitdepth > > + sub x0, x0, x1, lsl #1 > > +.if \bitdepth > 8 > > + ld1 {v0.8h}, [x0], x1 > > + ld1 {v1.8h}, [x0], x1 > > + ld1 {v2.8h}, [x0], x1 > > + ld1 {v3.8h}, [x0] > > +.else > > + ld1 {v0.8b}, [x0], x1 > > + ld1 {v1.8b}, [x0], x1 > > + ld1 {v2.8b}, [x0], x1 > > + ld1 {v3.8b}, [x0] > > +.endif > > + sub x0, x0, x1, lsl #1 > > + bl hevc_loop_filter_chroma_body_\bitdepth\()_neon > > +.if \bitdepth > 8 > > + st1 {v1.8h}, [x0], x1 > > + st1 {v2.8h}, [x0] > > +.else > > + st1 {v1.8b}, [x0], x1 > > + st1 {v2.8b}, [x0] > > +.endif > > +1: ret x4 > > +endfunc > > +.endm > > + > > +.macro hevc_v_loop_filter_chroma bitdepth > > +function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1 > > + hevc_loop_filter_chroma_start \bitdepth > > + sub x0, x0, #((0x5200 >> \bitdepth) & 0x6) // high -> 4, low -> 2 > > TBH, I think this is rather obfuscated - I'd prefer to just move the sub > (and the two instructions inbetween) back 
inside of the .if/.else, to have > the sub instruction say more explicitly exactly what it does. > > Other than that, this patch LGTM now. >

Thanks, pushed with changes.
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 02fb51c3ab..216191640c 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -65,7 +65,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ aarch64/vp9lpf_neon.o \ aarch64/vp9mc_16bpp_neon.o \ aarch64/vp9mc_neon.o -NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \ +NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_deblock_neon.o \ + aarch64/hevcdsp_idct_neon.o \ aarch64/hevcdsp_init_aarch64.o \ aarch64/hevcdsp_qpel_neon.o \ aarch64/hevcdsp_sao_neon.o diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S new file mode 100644 index 0000000000..ed342e5ded --- /dev/null +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S @@ -0,0 +1,180 @@ +/* -*-arm64-*- + * vim: syntax=arm64asm + * + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi> + * Copyright (c) 2023 J. Dekker <jdek@itanimul.li> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + +.macro hevc_loop_filter_chroma_start bitdepth + mov x4, x30 + ldr w14, [x2] + ldr w15, [x2, #4] +.if \bitdepth > 8 + lsl w14, w14, #(\bitdepth - 8) + lsl w15, w15, #(\bitdepth - 8) +.endif + adds w2, w14, w15 + b.eq 1f + dup v16.4h, w14 + dup v17.4h, w15 + trn1 v16.2d, v16.2d, v17.2d +.if \bitdepth > 8 + mvni v19.8h, #((0xff << (\bitdepth - 8)) & 0xff), lsl #8 + movi v18.8h, #0 +.endif + neg v17.8h, v16.8h +.endm + +.macro hevc_loop_filter_chroma_body bitdepth +.if \bitdepth <= 8 + uxtl v20.8h, v0.8b // p1 + uxtl v1.8h, v1.8b // p0 + uxtl v2.8h, v2.8b // q0 + uxtl v23.8h, v3.8b // q1 + va .req v20 + vb .req v23 +.else // required to specify both cases as we are unable to do: v0 .req v20 + va .req v0 + vb .req v3 +.endif + sub v5.8h, v2.8h, v1.8h // q0 - p0 + sub v6.8h, va.8h, vb.8h // p1 - q1 + shl v5.8h, v5.8h, #2 + add v5.8h, v6.8h, v5.8h + srshr v5.8h, v5.8h, #3 + clip v17.8h, v16.8h, v5.8h + sqadd v1.8h, v1.8h, v5.8h // p0 + delta + sqsub v2.8h, v2.8h, v5.8h // q0 - delta +.if \bitdepth <= 8 + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h +.unreq va +.unreq vb +.else + clip v18.8h, v19.8h, v1.8h, v2.8h +.endif +.endm + +function hevc_loop_filter_chroma_body_8_neon, export=0 + hevc_loop_filter_chroma_body 8 + ret +endfunc + +function hevc_loop_filter_chroma_body_10_neon, export=0 +hevc_loop_filter_chroma_body_12_neon: + hevc_loop_filter_chroma_body 10 + ret +endfunc + +// void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + +.macro hevc_h_loop_filter_chroma bitdepth +function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1 + hevc_loop_filter_chroma_start \bitdepth + sub x0, x0, x1, lsl #1 +.if \bitdepth > 8 + 
ld1 {v0.8h}, [x0], x1 + ld1 {v1.8h}, [x0], x1 + ld1 {v2.8h}, [x0], x1 + ld1 {v3.8h}, [x0] +.else + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x0] +.endif + sub x0, x0, x1, lsl #1 + bl hevc_loop_filter_chroma_body_\bitdepth\()_neon +.if \bitdepth > 8 + st1 {v1.8h}, [x0], x1 + st1 {v2.8h}, [x0] +.else + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0] +.endif +1: ret x4 +endfunc +.endm + +.macro hevc_v_loop_filter_chroma bitdepth +function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1 + hevc_loop_filter_chroma_start \bitdepth + sub x0, x0, #((0x5200 >> \bitdepth) & 0x6) // high -> 4, low -> 2 + add x3, x0, x1 + lsl x1, x1, #1 +.if \bitdepth > 8 + ld1 {v0.d}[0], [x0], x1 + ld1 {v1.d}[0], [x3], x1 + ld1 {v2.d}[0], [x0], x1 + ld1 {v3.d}[0], [x3], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[1], [x3], x1 + ld1 {v2.d}[1], [x0], x1 + ld1 {v3.d}[1], [x3], x1 + transpose_4x8H v0, v1, v2, v3, v28, v29, v30, v31 +.else + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[0], [x3], x1 + ld1 {v2.s}[0], [x0], x1 + ld1 {v3.s}[0], [x3], x1 + ld1 {v0.s}[1], [x0], x1 + ld1 {v1.s}[1], [x3], x1 + ld1 {v2.s}[1], [x0], x1 + ld1 {v3.s}[1], [x3], x1 + transpose_4x8B v0, v1, v2, v3, v28, v29, v30, v31 +.endif + sub x0, x0, x1, lsl #2 + sub x3, x3, x1, lsl #2 + bl hevc_loop_filter_chroma_body_\bitdepth\()_neon +.if \bitdepth > 8 + transpose_4x8H v0, v1, v2, v3, v28, v29, v30, v31 + st1 {v0.d}[0], [x0], x1 + st1 {v1.d}[0], [x3], x1 + st1 {v2.d}[0], [x0], x1 + st1 {v3.d}[0], [x3], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[1], [x3], x1 + st1 {v2.d}[1], [x0], x1 + st1 {v3.d}[1], [x3] +.else + transpose_4x8B v0, v1, v2, v3, v28, v29, v30, v31 + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x3], x1 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x3], x1 + st1 {v0.s}[1], [x0], x1 + st1 {v1.s}[1], [x3], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x3] +.endif +1: ret x4 +endfunc +.endm + +hevc_h_loop_filter_chroma 8 +hevc_h_loop_filter_chroma 10 +hevc_h_loop_filter_chroma 12 
+ +hevc_v_loop_filter_chroma 8 +hevc_v_loop_filter_chroma 10 +hevc_v_loop_filter_chroma 12 diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 1deefca0a2..a923bae35c 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -25,6 +25,18 @@ #include "libavutil/aarch64/cpu.h" #include "libavcodec/hevcdsp.h" +void ff_hevc_v_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_v_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_v_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_h_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_h_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs, @@ -117,6 +129,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) if (!have_neon(av_get_cpu_flags())) return; if (bit_depth == 8) { + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_neon; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_neon; c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon; c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon; c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon; @@ -167,6 +181,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) 
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon; } if (bit_depth == 10) { + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_neon; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_neon; c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon; c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon; c->add_residual[2] = ff_hevc_add_residual_16x16_10_neon; @@ -180,6 +196,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon; } if (bit_depth == 12) { + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_neon; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_neon; c->add_residual[0] = ff_hevc_add_residual_4x4_12_neon; c->add_residual[1] = ff_hevc_add_residual_8x8_12_neon; c->add_residual[2] = ff_hevc_add_residual_16x16_12_neon;
Benched on Ampere Altra:

hevc_h_loop_filter_chroma8_c: 367.7
hevc_h_loop_filter_chroma8_neon: 31.0
hevc_h_loop_filter_chroma10_c: 396.7
hevc_h_loop_filter_chroma10_neon: 27.5
hevc_h_loop_filter_chroma12_c: 377.0
hevc_h_loop_filter_chroma12_neon: 31.7
hevc_v_loop_filter_chroma8_c: 369.0
hevc_v_loop_filter_chroma8_neon: 55.0
hevc_v_loop_filter_chroma10_c: 389.0
hevc_v_loop_filter_chroma10_neon: 54.0
hevc_v_loop_filter_chroma12_c: 389.5
hevc_v_loop_filter_chroma12_neon: 53.0

Signed-off-by: J. Dekker <jdek@itanimul.li>
---

Included Martin's comments, decent speedup on vertical filter (~50%).

 libavcodec/aarch64/Makefile | 3 +-
 libavcodec/aarch64/hevcdsp_deblock_neon.S | 180 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c | 18 +++
 3 files changed, 200 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S