[FFmpeg-devel,2/2] aarch64/mdct15: implement a WIP fft15 NEON implementation

Message ID	LcsIFXU--3-1@lynne.ee
State	New
Headers	show Return-Path: <ffmpeg-devel-bounces@ffmpeg.org> Date: Sat, 20 Apr 2019 03:12:14 +0200 (CEST) From: Lynne <dev@lynne.ee> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Message-ID: <LcsIFXU--3-1@lynne.ee> In-Reply-To: <LcsHrEO--3-1@lynne.ee-LcsHvIy----1> References: <LcsHrEO--3-1@lynne.ee-LcsHvIy----1> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="----=_Part_45535_1989321391.1555722734168" Subject: [FFmpeg-devel] [PATCH 2/2] aarch64/mdct15: implement a WIP fft15 NEON implementation Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

From 534dd20b4961c2f0c4131fbbadac5f1940bad3a0 Mon Sep 17 00:00:00 2001 From: Lynne <dev@lynne.ee> Date: Sat, 20 Apr 2019 02:05:23 +0100 Subject: [PATCH 2/2] aarch64/mdct15: implement a WIP fft15 NEON implementation NEON: 5225 UNITS in fft15, 65536 runs, 0 skips C: 7889 UNITS in fft15, 65535 runs, 1 skips Posting this to maybe get some advice; it's too slow for any noticeable decoding speed increase. --- libavcodec/aarch64/Makefile | 2 + libavcodec/aarch64/mdct15_init.c | 86 +++++++++++++++++++ libavcodec/aarch64/mdct15_neon.S | 136 +++++++++++++++++++++++++++++++ libavcodec/mdct15.c | 3 + libavcodec/mdct15.h | 1 + 5 files changed, 228 insertions(+) create mode 100644 libavcodec/aarch64/mdct15_init.c create mode 100644 libavcodec/aarch64/mdct15_neon.S diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 00f93bf59f..8093fc17cb 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -6,6 +6,7 @@ OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o +OBJS-$(CONFIG_MDCT15) += aarch64/mdct15_init.o OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o @@ -43,6 +44,7 @@ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o \ aarch64/simple_idct_neon.o +NEON-OBJS-$(CONFIG_MDCT15) += aarch64/mdct15_neon.o NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o diff --git a/libavcodec/aarch64/mdct15_init.c b/libavcodec/aarch64/mdct15_init.c new file mode 100644 index 0000000000..a4e24c9c13 --- /dev/null +++ 
b/libavcodec/aarch64/mdct15_init.c @@ -0,0 +1,86 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/mdct15.h" + +/* NEON fft15 entry point, implemented in mdct15_neon.S. NOTE(review): this file uses memcpy() and av_cold below without directly including <string.h> or libavutil/attributes.h; presumably both arrive transitively through the headers above - confirm. */ void ff_fft15_neon(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride); + +/* Rewrites s->exptab in place into the layout that ff_fft15_neon reads: entries 0..47 and 48..59 receive the 15-point twiddles, each followed by a (-im, re) companion, and entries 60..63 receive the 5-point constants derived from exptab[19] and exptab[20]. Assumes s->exptab holds at least 64 entries - TODO confirm against the allocation in mdct15.c. */ static void perm_twiddles(MDCT15Context *s) +{ + int k; + FFTComplex tmp[30]; + + /* 5-point FFT twiddles */ + s->exptab[60].re = s->exptab[19].re; + s->exptab[60].im = s->exptab[19].im; + s->exptab[61].re = -s->exptab[20].re; + s->exptab[61].im = s->exptab[20].im; + s->exptab[62].re = s->exptab[20].im; + s->exptab[62].im = s->exptab[20].im; + s->exptab[63].re = -s->exptab[20].im; + s->exptab[63].im = -s->exptab[20].im; + + /* 15-point FFT twiddles */ /* Gather entries from exptab[0..18] into tmp as five rows (k) of six columns; this copy must happen before the memcpy calls below overwrite exptab[0..59]. */ + for (k = 0; k < 5; k++) { + tmp[6*k + 0] = s->exptab[k + 0]; + tmp[6*k + 2] = s->exptab[k + 5]; + tmp[6*k + 4] = s->exptab[k + 10]; + + tmp[6*k + 1] = s->exptab[2 * (k + 0)]; + tmp[6*k + 3] = s->exptab[2 * (k + 5)]; + tmp[6*k + 5] = s->exptab[2 * k + 5 ]; + } + + /* For each column k, take rows 1..4 of tmp in pairs: two (re, im) entries followed by their (-im, re) companions. Eight entries per column fill exptab[0..47]. */ for (k = 0; k < 6; k++) { + FFTComplex ac_exp[] = { + { tmp[6*1 + k].re, tmp[6*1 + k].im }, + { tmp[6*2 + k].re, tmp[6*2 + k].im }, + { -tmp[6*1 + k].im, tmp[6*1 + k].re }, + { -tmp[6*2 + k].im, tmp[6*2 + k].re }, + { tmp[6*3 + k].re, tmp[6*3 + k].im }, + 
{ tmp[6*4 + k].re, tmp[6*4 + k].im }, + { -tmp[6*3 + k].im, tmp[6*3 + k].re }, + { -tmp[6*4 + k].im, tmp[6*4 + k].re }, + }; + memcpy(s->exptab + 8*k, ac_exp, 8*sizeof(FFTComplex)); + } + + /* Special case when k = 0: row 0 of tmp (tmp[0..5]) is stored, two entries plus their (-im, re) companions at a time, into exptab[48..59]. */ + for (k = 0; k < 3; k++) { + FFTComplex dc_exp[] = { + { tmp[2*k + 0].re, tmp[2*k + 0].im }, + { tmp[2*k + 1].re, tmp[2*k + 1].im }, + { -tmp[2*k + 0].im, tmp[2*k + 0].re }, + { -tmp[2*k + 1].im, tmp[2*k + 1].re }, + }; + memcpy(s->exptab + 8*6 + 4*k, dc_exp, 4*sizeof(FFTComplex)); + } +} + +/* AArch64 init hook for MDCT15: when taken, installs ff_fft15_neon as s->fft15 and permutes the twiddle table to match it. */ av_cold void ff_mdct15_init_aarch64(MDCT15Context *s) +{ + int cpu_flags = av_get_cpu_flags(); + + /* The leading "0 &&" makes this condition constant-false, so the NEON path is never enabled as written (the patch is marked WIP). */ if (0 && have_neon(cpu_flags)) { + s->fft15 = ff_fft15_neon; + perm_twiddles(s); + } +} diff --git a/libavcodec/aarch64/mdct15_neon.S b/libavcodec/aarch64/mdct15_neon.S new file mode 100644 index 0000000000..aee12b25b8 --- /dev/null +++ b/libavcodec/aarch64/mdct15_neon.S @@ -0,0 +1,136 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +/* Per-lane sign factors (1, -1, -1, 1); ff_fft15_neon loads them into v7 and fft5 multiplies them into its rotated terms. */ const sign_adj_pnnp, align=4 + .float 1, -1, -1, 1 +endconst + +/* fft5 dc, ac1, ac2: 5-point transform of the 8-byte complex elements at indices 0, 3, 6, 9 and 12 relative to x1 on entry. The first load post-increments x1 by 8, so each successive invocation starts one element later. Expects v5/v6 (constants) and v7 (sign factors) to be preloaded by the caller; uses x4 and v0-v4 as scratch. Writes the DC term to the low two lanes of dc and the four remaining outputs to ac1 and ac2. */ .macro fft5 dc, ac1, ac2 + ld1r {v0.2d}, [x1], #8 // in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im + add x4, x1, #(2*2*4) + ld1 {v1.d}[0], [x4] // in[ 3].re, in[ 3].im, 0, 0 + add x4, x1, #(5*2*4) + ld1 {v1.d}[1], [x4] // in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im + add x4, x1, #(8*2*4) + ld1 {v2.d}[1], [x4] // 0, 0, in[ 9].re, in[ 9].im + add x4, x1, #(11*2*4) + ld1 {v2.d}[0], [x4] // in[12].re, in[12].im, in[ 9].re, in[ 9].im + + fsub v3.4s, v1.4s, v2.4s // t[1].im, t[1].re, t[3].im, t[3].re + fadd v1.4s, v1.4s, v2.4s // t[0].re, t[0].im, t[2].re, t[2].im + + mov v4.d[0], v3.d[1] + mov v4.d[1], v3.d[0] // t[3].im, t[3].re, t[1].im, t[1].re + + mov v2.d[0], v1.d[1] + mov v2.d[1], v1.d[0] // t[2].re, t[2].im, t[0].re, t[0].im + + fadd \dc\().2s, v0.2s, v1.2s + fadd \dc\().2s, \dc\().2s, v2.2s // DC[0].re, DC[0].im + + fmul v1.4s, v1.4s, v5.s[0] // exptab[0].re * t[0,2].re,im + fmul v3.4s, v3.4s, v5.s[1] // exptab[0].im * t[1,3].im,re + + fmla v1.4s, v2.4s, v5.s[2] // t[0].re, t[0].im, t[4].re, t[4].im + fmla v3.4s, v4.4s, v6.4s // t[1].im, t[1].re, t[5].im, t[5].re + + fadd v1.4s, v1.4s, v0.4s // in[0,0].re,im + t[0,4].re,im + rev64 v3.4s, v3.4s // t[1].re, t[1].im, t[5].re, t[5].im + + fmul v3.4s, v3.4s, v7.4s // (+--+)t[1,5].re,im + + fadd \ac1\().4s, v1.4s, v3.4s // out[1,2].re,im + fsub v2.4s, v1.4s, v3.4s // out[4,3].re,im + + mov \ac2\().d[0], v2.d[1] + mov \ac2\().d[1], v2.d[0] // out[3,4].re,im +.endm + +/* butterflies_dc offset: combines the DC terms held in v16, v19 and v22 using two q registers of twiddles loaded from x2 + offset, then stores one complex value to [x0] and advances x0 by x3. The register names are hard-coded, so this relies on the v16/v19/v22 assignment made by the fft5 calls in ff_fft15_neon. Clobbers x4 and v0-v3. */ .macro butterflies_dc offset + add x4, x2, #(\offset) + ld1 {v0.4s, v1.4s}, [x4] + + uzp1 v2.4s, v19.4s, v22.4s // t[0].re, t[0].re, t[1].re, t[1].re + uzp2 v3.4s, v19.4s, v22.4s // t[0].im, t[0].im, t[1].im, t[1].im + + fmul v2.4s, 
v2.4s, v0.4s // t[0,1].rere * exptab[k,2*k].reim + fmla v2.4s, v3.4s, v1.4s // t[0].re, t[0].im, t[1].re, t[1].im + + mov v3.d[0], v2.d[1] // t[1].re, t[1].im + fadd v2.2s, v2.2s, v3.2s + fadd v2.2s, v2.2s, v16.2s + + st1 {v2.2s}, [x0], x3 +.endm + +/* butterflies_ac offset, dc, ac1, ac2: loads four q registers of twiddles (from x2 + offset and from 64 bytes further on), multiplies them with the duplicated re and im lanes of ac1 and ac2, sums the two products, adds dc, and stores two complex values to [x0], advancing x0 by x3 after each store. Clobbers x4 and v0-v7; v5-v7 are the constants fft5 expects, so no fft5 may follow without reloading them. */ .macro butterflies_ac offset, dc, ac1, ac2 + add x4, x2, #(\offset) + ld1 {v0.4s, v1.4s}, [x4] // exptab[01].reim, exptab[23].reim + add x4, x4, #64 + ld1 {v2.4s, v3.4s}, [x4] // exptab[01].reim, exptab[23].reim + + trn1 v4.4s, \ac1\().4s, \ac1\().4s // t[1].re, t[1].re, t[2].re, t[2].re + trn2 v5.4s, \ac1\().4s, \ac1\().4s // t[1].im, t[1].im, t[2].im, t[2].im + trn1 v6.4s, \ac2\().4s, \ac2\().4s + trn2 v7.4s, \ac2\().4s, \ac2\().4s + + fmul v0.4s, v4.4s, v0.4s // tmp2[1,1].re * exptab[1,2].reim + fmul v2.4s, v6.4s, v2.4s + + fmla v0.4s, v5.4s, v1.4s // t[0].re, t[0].im, t[0].re, t[1].im + fmla v2.4s, v7.4s, v3.4s + + fadd v0.4s, v0.4s, v2.4s + + fadd v0.4s, v0.4s, \dc\().4s // out[stride*1].reim, out[stride*2].reim + + st1 {v0.d}[0], [x0], x3 + st1 {v0.d}[1], [x0], x3 +.endm + +/* ff_fft15_neon: declared in mdct15_init.c as (FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride); presumably x0 = out, x1 = in, x2 = exptab, x3 = stride under AAPCS64 - confirm. Loads the sign factors into v7 and the constants at x2 + 60*2*4 bytes into v5/v6, runs three fft5 passes (results in v16-v24), duplicates the low halves of v19 and v22 for butterflies_dc, multiplies x3 by 8 (lsl #3), then writes 15 complex outputs through three groups of one butterflies_dc and two butterflies_ac. */ function ff_fft15_neon, export=1 + movrel x4, sign_adj_pnnp + ld1 {v7.4s}, [x4] + + add x4, x2, #(60*2*4) + ld1 {v5.4s, v6.4s}, [x4] + + fft5 v16, v17, v18 + fft5 v19, v20, v21 + fft5 v22, v23, v24 + + mov v19.d[1], v19.d[0] + mov v22.d[1], v22.d[0] + lsl x3, x3, #3 + + butterflies_dc (8*6 + 4*0)*2*4 + butterflies_ac (8*0 + 0*0)*2*4, v17, v20, v23 + butterflies_ac (8*0 + 1*4)*2*4, v18, v21, v24 + + butterflies_dc (8*6 + 4*1)*2*4 + butterflies_ac (8*2 + 0*0)*2*4, v17, v20, v23 + butterflies_ac (8*2 + 1*4)*2*4, v18, v21, v24 + + butterflies_dc (8*6 + 4*2)*2*4 + butterflies_ac (8*4 + 0*0)*2*4, v17, v20, v23 + butterflies_ac (8*4 + 1*4)*2*4, v18, v21, v24 + + ret +endfunc diff --git a/libavcodec/mdct15.c b/libavcodec/mdct15.c index 6f35059bfe..9e2003b1af 100644 --- a/libavcodec/mdct15.c +++ b/libavcodec/mdct15.c @@ -319,6 +319,9 @@ av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale) if (ARCH_X86) ff_mdct15_init_x86(s); 
+ if (ARCH_AARCH64) + ff_mdct15_init_aarch64(s); + *ps = s; return 0; diff --git a/libavcodec/mdct15.h b/libavcodec/mdct15.h index 42e60f3e10..58a1946db0 100644 --- a/libavcodec/mdct15.h +++ b/libavcodec/mdct15.h @@ -58,5 +58,6 @@ int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale); void ff_mdct15_uninit(MDCT15Context **ps); void ff_mdct15_init_x86(MDCT15Context *s); +void ff_mdct15_init_aarch64(MDCT15Context *s); #endif /* AVCODEC_MDCT15_H */ -- 2.20.1

[FFmpeg-devel,2/2] aarch64/mdct15: implement a WIP fft15 NEON implementation

Commit Message

Patch