[FFmpeg-devel] libavcodec/hevcdsp: port SIMD idct functions from 32-bit.

Message ID	20210116010345.20401-1-Reimar.Doeffinger@gmx.de
State	New
Headers	show Return-Path: <ffmpeg-devel-bounces@ffmpeg.org> From: Reimar.Doeffinger@gmx.de To: ffmpeg-devel@ffmpeg.org Date: Sat, 16 Jan 2021 02:03:45 +0100 Message-Id: <20210116010345.20401-1-Reimar.Doeffinger@gmx.de> In-Reply-To: <alpine.DEB.2.23.453.2101160026040.4295@cone.martin.st> References: <alpine.DEB.2.23.453.2101160026040.4295@cone.martin.st> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH] libavcodec/hevcdsp: port SIMD idct functions from 32-bit. Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: =?utf-8?q?Reimar_D=C3=B6ffinger?= <Reimar.Doeffinger@gmx.de> Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel] libavcodec/hevcdsp: port SIMD idct functions from 32-bit. \| expand [FFmpeg-devel] libavcodec/hevcdsp: port SIMD idct functions from 32-bit.

Context	Check	Description
andriy/x86_make	success	Make finished
andriy/x86_make_fate	success	Make fate finished
andriy/PPC64_make	success	Make finished
andriy/PPC64_make_fate	success	Make fate finished

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index f6434e40da..2ea1d74a38 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -61,3 +61,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
                                            aarch64/vp9lpf_neon.o               \
                                            aarch64/vp9mc_16bpp_neon.o          \
                                            aarch64/vp9mc_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
+                                           aarch64/hevcdsp_init_aarch64.o
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
new file mode 100644
index 0000000000..4aac205e22
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -0,0 +1,380 @@
+/*
+ * ARM NEON optimised IDCT functions for HEVC decoding
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ * Copyright (c) 2017 Alexandra Hájková
+ *
+ * Ported from arm/hevcdsp_idct_neon.S by
+ * Copyright (c) 2020 Reimar Döffinger
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+const trans, align=4
+        .short 64, 83, 64, 36
+        .short 89, 75, 50, 18
+        .short 90, 87, 80, 70
+        .short 57, 43, 25, 9
+        .short 90, 90, 88, 85
+        .short 82, 78, 73, 67
+        .short 61, 54, 46, 38
+        .short 31, 22, 13, 4
+endconst
+
+.macro sum_sub out, in, c, op, p
+  .ifc \op, +
+        smlal\p         \out, \in, \c
+  .else
+        smlsl\p         \out, \in, \c
+  .endif
+.endm
+
+.macro fixsqrshrn d, dt, n, m
+  .ifc \dt, .8H
+        sqrshrn2        \d\dt, \n\().4S, \m
+  .else
+        sqrshrn         \n\().4H, \n\().4S, \m          // narrows in place: the low half of \n is overwritten
+        mov             \d\().D[0], \n\().D[0]          // only the low 64 bits of \d are replaced
+  .endif
+.endm
+
+// 4-point transform on 4 lanes; coefficients come from v0.H[0..3]; uses and clobbers v28-v31 as temp registers
+.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
+        sshll\p1        v28.4S, \in0, #6
+        mov             v29.16B, v28.16B
+        smull\p1        v30.4S, \in1, v0.H[1]
+        smull\p1        v31.4S, \in1, v0.H[3]
+        smlal\p2        v28.4S, \in2, v0.H[0] //e0
+        smlsl\p2        v29.4S, \in2, v0.H[0] //e1
+        smlal\p2        v30.4S, \in3, v0.H[3] //o0
+        smlsl\p2        v31.4S, \in3, v0.H[1] //o1
+
+        add             \out0, v28.4S, v30.4S
+        add             \out1, v29.4S, v31.4S
+        sub             \out2, v29.4S, v31.4S
+        sub             \out3, v28.4S, v30.4S
+.endm
+
+.macro transpose8_4x4 r0, r1, r2, r3
+        trn1            v2.8H, \r0\().8H, \r1\().8H     // v2-v5 are clobbered as temporaries
+        trn2            v3.8H, \r0\().8H, \r1\().8H
+        trn1            v4.8H, \r2\().8H, \r3\().8H
+        trn2            v5.8H, \r2\().8H, \r3\().8H
+        trn1            \r0\().4S, v2.4S, v4.4S
+        trn2            \r2\().4S, v2.4S, v4.4S
+        trn1            \r1\().4S, v3.4S, v5.4S
+        trn2            \r3\().4S, v3.4S, v5.4S
+.endm
+
+.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
+        transpose8_4x4  \r0, \r1, \r2, \r3
+        transpose8_4x4  \r4, \r5, \r6, \r7
+.endm
+
+.macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, in3,in3t, in4,in4t, in5,in5t, in6,in6t, in7,in7t, p1, p2
+        tr_4x4_8        \in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4S, v25.4S, v26.4S, v27.4S, \p1, \p2
+
+        smull\p1        v30.4S, \in1\in1t, v0.H[6]
+        smull\p1        v28.4S, \in1\in1t, v0.H[4]
+        smull\p1        v29.4S, \in1\in1t, v0.H[5]
+        sum_sub         v30.4S, \in3\in3t, v0.H[4], -, \p1
+        sum_sub         v28.4S, \in3\in3t, v0.H[5], +, \p1
+        sum_sub         v29.4S, \in3\in3t, v0.H[7], -, \p1
+
+        sum_sub         v30.4S, \in5\in5t, v0.H[7], +, \p2
+        sum_sub         v28.4S, \in5\in5t, v0.H[6], +, \p2
+        sum_sub         v29.4S, \in5\in5t, v0.H[4], -, \p2
+
+        sum_sub         v30.4S, \in7\in7t, v0.H[5], +, \p2
+        sum_sub         v28.4S, \in7\in7t, v0.H[7], +, \p2
+        sum_sub         v29.4S, \in7\in7t, v0.H[6], -, \p2
+
+        add             v31.4S, v26.4S, v30.4S
+        sub             v26.4S, v26.4S, v30.4S
+        fixsqrshrn      \in2,\in2t, v31, \shift
+
+
+        smull\p1        v31.4S, \in1\in1t, v0.H[7]
+        sum_sub         v31.4S, \in3\in3t, v0.H[6], -, \p1
+        sum_sub         v31.4S, \in5\in5t, v0.H[5], +, \p2
+        sum_sub         v31.4S, \in7\in7t, v0.H[4], -, \p2
+        fixsqrshrn      \in5,\in5t, v26, \shift
+
+
+        add             v26.4S, v24.4S, v28.4S
+        sub             v24.4S, v24.4S, v28.4S
+        add             v28.4S, v25.4S, v29.4S
+        sub             v25.4S, v25.4S, v29.4S
+        add             v30.4S, v27.4S, v31.4S
+        sub             v27.4S, v27.4S, v31.4S
+
+        fixsqrshrn      \in0,\in0t, v26, \shift
+        fixsqrshrn      \in7,\in7t, v24, \shift
+        fixsqrshrn      \in1,\in1t, v28, \shift
+        fixsqrshrn      \in6,\in6t, v25, \shift
+        fixsqrshrn      \in3,\in3t, v30, \shift
+        fixsqrshrn      \in4,\in4t, v27, \shift
+.endm
+
+.macro idct_8x8 bitdepth
+function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
+//x0 - coeffs (64 int16 values, transformed in place)
+        mov             x1,  x0
+        ld1             {v16.8H-v19.8H}, [x1], #64
+        ld1             {v20.8H-v23.8H}, [x1]
+
+        movrel          x1, trans
+        ld1             {v0.8H}, [x1]
+
+        tr_8x4          7, v16,.4H, v17,.4H, v18,.4H, v19,.4H, v20,.4H, v21,.4H, v22,.4H, v23,.4H
+        tr_8x4          7, v16,.8H, v17,.8H, v18,.8H, v19,.8H, v20,.8H, v21,.8H, v22,.8H, v23,.8H, 2, 2
+
+        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23
+
+        tr_8x4          20 - \bitdepth, v16,.4H, v17,.4H, v18,.4H, v19,.4H, v16,.8H, v17,.8H, v18,.8H, v19,.8H, , 2
+        tr_8x4          20 - \bitdepth, v20,.4H, v21,.4H, v22,.4H, v23,.4H, v20,.8H, v21,.8H, v22,.8H, v23,.8H, , 2
+
+        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23
+
+        mov             x1,  x0
+        st1             {v16.8H-v19.8H}, [x1], #64
+        st1             {v20.8H-v23.8H}, [x1]
+
+        ret
+endfunc
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m
+        add             \tmp_p, \e, \o
+        sub             \tmp_m, \e, \o
+.endm
+
+.macro tr16_8x4 in0, in1, in2, in3, offset
+        tr_4x4_8        \in0\().4H, \in1\().4H, \in2\().4H, \in3\().4H, v24.4S, v25.4S, v26.4S, v27.4S
+
+        smull2          v28.4S, \in0\().8H, v0.H[4]
+        smull2          v29.4S, \in0\().8H, v0.H[5]
+        smull2          v30.4S, \in0\().8H, v0.H[6]
+        smull2          v31.4S, \in0\().8H, v0.H[7]
+        sum_sub         v28.4S, \in1\().8H, v0.H[5], +, 2
+        sum_sub         v29.4S, \in1\().8H, v0.H[7], -, 2
+        sum_sub         v30.4S, \in1\().8H, v0.H[4], -, 2
+        sum_sub         v31.4S, \in1\().8H, v0.H[6], -, 2
+
+        sum_sub         v28.4S, \in2\().8H, v0.H[6], +, 2
+        sum_sub         v29.4S, \in2\().8H, v0.H[4], -, 2
+        sum_sub         v30.4S, \in2\().8H, v0.H[7], +, 2
+        sum_sub         v31.4S, \in2\().8H, v0.H[5], +, 2
+
+        sum_sub         v28.4S, \in3\().8H, v0.H[7], +, 2
+        sum_sub         v29.4S, \in3\().8H, v0.H[6], -, 2
+        sum_sub         v30.4S, \in3\().8H, v0.H[5], +, 2
+        sum_sub         v31.4S, \in3\().8H, v0.H[4], -, 2
+
+        butterfly       v24.4S, v28.4S, v16.4S, v23.4S
+        butterfly       v25.4S, v29.4S, v17.4S, v22.4S
+        butterfly       v26.4S, v30.4S, v18.4S, v21.4S
+        butterfly       v27.4S, v31.4S, v19.4S, v20.4S
+        add             x4, sp, #\offset                // spill the 8 results (128 bytes) to the stack scratch area
+        st1             {v16.4S-v19.4S}, [x4], #64
+        st1             {v20.4S-v23.4S}, [x4]
+.endm
+
+.macro load16 in0, in1, in2, in3
+        ld1             {\in0}[0], [x1], x2
+        ld1             {\in0}[1], [x3], x2
+        ld1             {\in1}[0], [x1], x2
+        ld1             {\in1}[1], [x3], x2
+        ld1             {\in2}[0], [x1], x2
+        ld1             {\in2}[1], [x3], x2
+        ld1             {\in3}[0], [x1], x2
+        ld1             {\in3}[1], [x3], x2
+.endm
+
+.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
+        sum_sub         v21.4S, \in, \t0, \op0, \p
+        sum_sub         v22.4S, \in, \t1, \op1, \p
+        sum_sub         v23.4S, \in, \t2, \op2, \p
+        sum_sub         v24.4S, \in, \t3, \op3, \p
+        sum_sub         v25.4S, \in, \t4, \op4, \p
+        sum_sub         v26.4S, \in, \t5, \op5, \p
+        sum_sub         v27.4S, \in, \t6, \op6, \p
+        sum_sub         v28.4S, \in, \t7, \op7, \p
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+        add             v20.4S, \in0, \in1              // first sum always lands in v20
+        sub             \in0, \in0, \in1
+        add             \in1, \in2, \in3
+        sub             \in2, \in2, \in3
+        add             \in3, \in4, \in5
+        sub             \in4, \in4, \in5
+        add             \in5, \in6, \in7
+        sub             \in6, \in6, \in7
+.endm
+
+.macro store16 in0, in1, in2, in3, rx
+        st1             {\in0}[0], [x1], x2
+        st1             {\in0}[1], [x3], \rx
+        st1             {\in1}[0], [x1], x2
+        st1             {\in1}[1], [x3], \rx
+        st1             {\in2}[0], [x1], x2
+        st1             {\in2}[1], [x3], \rx
+        st1             {\in3}[0], [x1], x2
+        st1             {\in3}[1], [x3], \rx
+.endm
+
+.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
+        sqrshrn         \out0\().4H, \in0, \shift
+        sqrshrn2        \out0\().8H, \in1, \shift
+        sqrshrn         \out1\().4H, \in2, \shift
+        sqrshrn2        \out1\().8H, \in3, \shift
+        sqrshrn         \out2\().4H, \in4, \shift
+        sqrshrn2        \out2\().8H, \in5, \shift
+        sqrshrn         \out3\().4H, \in6, \shift
+        sqrshrn2        \out3\().8H, \in7, \shift
+.endm
+
+.macro transpose16_4x4_2 r0, r1, r2, r3
+        // lower halves (v2-v7 are clobbered as temporaries)
+        trn1            v2.4H, \r0\().4H, \r1\().4H
+        trn2            v3.4H, \r0\().4H, \r1\().4H
+        trn1            v4.4H, \r2\().4H, \r3\().4H
+        trn2            v5.4H, \r2\().4H, \r3\().4H
+        trn1            v6.2S, v2.2S, v4.2S
+        trn2            v7.2S, v2.2S, v4.2S
+        trn1            v2.2S, v3.2S, v5.2S
+        trn2            v4.2S, v3.2S, v5.2S
+        mov             \r0\().D[0], v6.D[0]
+        mov             \r2\().D[0], v7.D[0]
+        mov             \r1\().D[0], v2.D[0]
+        mov             \r3\().D[0], v4.D[0]
+
+        // upper halves in reverse order
+        trn1            v2.8H, \r3\().8H, \r2\().8H
+        trn2            v3.8H, \r3\().8H, \r2\().8H
+        trn1            v4.8H, \r1\().8H, \r0\().8H
+        trn2            v5.8H, \r1\().8H, \r0\().8H
+        trn1            v6.4S, v2.4S, v4.4S
+        trn2            v7.4S, v2.4S, v4.4S
+        trn1            v2.4S, v3.4S, v5.4S
+        trn2            v4.4S, v3.4S, v5.4S
+        mov             \r3\().D[1], v6.D[1]
+        mov             \r1\().D[1], v7.D[1]
+        mov             \r2\().D[1], v2.D[1]
+        mov             \r0\().D[1], v4.D[1]
+.endm
+
+.macro tr_16x4 name, shift, offset, step
+function func_tr_16x4_\name
+        mov             x1,  x5                         // x5 = source pointer set up by the caller
+        add             x3,  x5, #(\step * 64)
+        mov             x2,  #(\step * 128)
+        load16          v16.D, v17.D, v18.D, v19.D
+        movrel          x1,  trans
+        ld1             {v0.8H}, [x1]
+
+        tr16_8x4        v16, v17, v18, v19, \offset
+
+        add             x1,  x5, #(\step * 32)
+        add             x3,  x5, #(\step * 3 *32)
+        mov             x2,  #(\step * 128)
+        load16          v20.D, v17.D, v18.D, v19.D
+        movrel          x1,  trans, 16                  // second group of 8 coefficients in the table
+        ld1             {v1.8H}, [x1]
+        smull           v21.4S, v20.4H, v1.H[0]
+        smull           v22.4S, v20.4H, v1.H[1]
+        smull           v23.4S, v20.4H, v1.H[2]
+        smull           v24.4S, v20.4H, v1.H[3]
+        smull           v25.4S, v20.4H, v1.H[4]
+        smull           v26.4S, v20.4H, v1.H[5]
+        smull           v27.4S, v20.4H, v1.H[6]
+        smull           v28.4S, v20.4H, v1.H[7]
+
+        add_member      v20.8H, v1.H[1], v1.H[4], v1.H[7], v1.H[5], v1.H[2], v1.H[0], v1.H[3], v1.H[6], +, +, +, -, -, -, -, -, 2
+        add_member      v17.4H, v1.H[2], v1.H[7], v1.H[3], v1.H[1], v1.H[6], v1.H[4], v1.H[0], v1.H[5], +, +, -, -, -, +, +, +
+        add_member      v17.8H, v1.H[3], v1.H[5], v1.H[1], v1.H[7], v1.H[0], v1.H[6], v1.H[2], v1.H[4], +, -, -, +, +, +, -, -, 2
+        add_member      v18.4H, v1.H[4], v1.H[2], v1.H[6], v1.H[0], v1.H[7], v1.H[1], v1.H[5], v1.H[3], +, -, -, +, -, -, +, +
+        add_member      v18.8H, v1.H[5], v1.H[0], v1.H[4], v1.H[6], v1.H[1], v1.H[3], v1.H[7], v1.H[2], +, -, +, +, -, +, +, -, 2
+        add_member      v19.4H, v1.H[6], v1.H[3], v1.H[0], v1.H[2], v1.H[5], v1.H[7], v1.H[4], v1.H[1], +, -, +, -, +, +, -, +
+        add_member      v19.8H, v1.H[7], v1.H[6], v1.H[5], v1.H[4], v1.H[3], v1.H[2], v1.H[1], v1.H[0], +, -, +, -, +, -, +, -, 2
+
+        add             x4, sp, #\offset                // reload the first half spilled by tr16_8x4
+        ld1             {v16.4S-v19.4S}, [x4], #64
+
+        butterfly16     v16.4S, v21.4S, v17.4S, v22.4S, v18.4S, v23.4S, v19.4S, v24.4S
+        scale           v29, v30, v31, v24, v20.4S, v16.4S, v21.4S, v17.4S, v22.4S, v18.4S, v23.4S, v19.4S, \shift
+        transpose16_4x4_2 v29, v30, v31, v24
+        mov             x1,  x6                         // x6 = destination pointer set up by the caller
+        add             x3,  x6, #(24 +3*32)
+        mov             x2,  #32
+        mov             x4,  #-32                       // x3 walks backwards through the destination
+        store16         v29.D, v30.D, v31.D, v24.D, x4
+
+        add             x4, sp, #(\offset + 64)         // second half of the spilled results
+        ld1             {v16.4S-v19.4S}, [x4]
+        butterfly16     v16.4S, v25.4S, v17.4S, v26.4S, v18.4S, v27.4S, v19.4S, v28.4S
+        scale           v29, v30, v31, v20, v20.4S, v16.4S, v25.4S, v17.4S, v26.4S, v18.4S, v27.4S, v19.4S, \shift
+        transpose16_4x4_2 v29, v30, v31, v20
+
+        add             x1,  x6, #8
+        add             x3,  x6, #(16 + 3 * 32)
+        mov             x2,  #32
+        mov             x4,  #-32
+        store16         v29.D, v30.D, v31.D, v20.D, x4
+
+        ret
+endfunc
+.endm
+
+.macro idct_16x16 bitdepth
+function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
+//x0 - coeffs (256 int16 values, transformed in place)
+        mov             x15, x30                        // keep the return address: the bl calls below overwrite x30
+
+        // allocate a temp buffer: 512 bytes of first-pass output plus 128 bytes of scratch at sp + 512
+        sub             sp,  sp, #640
+
+.irp i, 0, 1, 2, 3
+        add             x5,  x0, #(8 * \i)
+        add             x6,  sp, #(8 * \i * 16)
+        bl              func_tr_16x4_firstpass
+.endr
+
+.irp i, 0, 1, 2, 3
+        add             x5,  sp, #(8 * \i)
+        add             x6,  x0, #(8 * \i * 16)
+        bl              func_tr_16x4_secondpass_\bitdepth
+.endr
+
+        add             sp,  sp, #640
+
+        mov             x30, x15                        // restore the return address saved on entry
+        ret
+endfunc
+.endm
+
+idct_8x8 8
+idct_8x8 10
+
+tr_16x4 firstpass, 7, 512, 1
+tr_16x4 secondpass_8, 20 - 8, 512, 1
+tr_16x4 secondpass_10, 20 - 10, 512, 1
+
+idct_16x16 8
+idct_16x16 10
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
new file mode 100644
index 0000000000..19d9a7f9ed
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Reimar Döffinger
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hevcdsp.h"
+
+void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
+
+av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
+{
+    if (!have_neon(av_get_cpu_flags())) return; /* no NEON: leave the existing function pointers untouched */
+
+    if (bit_depth == 8) {
+        c->idct[1] = ff_hevc_idct_8x8_8_neon;
+        c->idct[2] = ff_hevc_idct_16x16_8_neon;
+    }
+    if (bit_depth == 10) { /* any other bit depth keeps the pointers already present in c */
+        c->idct[1] = ff_hevc_idct_8x8_10_neon;
+        c->idct[2] = ff_hevc_idct_16x16_10_neon;
+    }
+}
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 957e40d5ff..fe272ac1ce 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -257,6 +257,8 @@ int i = 0;
         break;
     }
 
+    if (ARCH_AARCH64)
+        ff_hevc_dsp_init_aarch64(hevcdsp, bit_depth);
     if (ARCH_ARM)
         ff_hevc_dsp_init_arm(hevcdsp, bit_depth);
     if (ARCH_PPC)
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index c605a343d6..0e013a8328 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -129,6 +129,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
 extern const int8_t ff_hevc_epel_filters[7][4];
 extern const int8_t ff_hevc_qpel_filters[3][16];
 
+void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);

[FFmpeg-devel] libavcodec/hevcdsp: port SIMD idct functions from 32-bit.

Checks

Commit Message

Patch