diff mbox series

[FFmpeg-devel] avfilter/af_anlmdn: add AArch64 SIMD for compute_distance_ssd

Message ID 20200124091516.22108-1-quinkblack@foxmail.com
State New
Headers show
Series [FFmpeg-devel] avfilter/af_anlmdn: add AArch64 SIMD for compute_distance_ssd
Related show

Checks

Context Check Description
andriy/ffmpeg-patchwork pending
andriy/ffmpeg-patchwork fail Failed to apply patch

Commit Message

zhilizhao Jan. 24, 2020, 9:15 a.m. UTC
./ffmpeg -threads 1  -f lavfi -t 60 -i anoisesrc -af 'anlmdn' -f null -benchmark -

Test results on Snapdragon 845:
    Before:
	size=N/A time=00:01:00.00 bitrate=N/A speed=11.2x
	video:0kB audio:5625kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
	bench: utime=5.320s stime=0.010s rtime=5.358s
	bench: maxrss=14172kB

    After:
	size=N/A time=00:01:00.00 bitrate=N/A speed=15.4x
	video:0kB audio:5625kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
	bench: utime=3.870s stime=0.000s rtime=3.902s
	bench: maxrss=14036kB
---
 libavfilter/aarch64/Makefile         |   2 +
 libavfilter/aarch64/af_anlmdn_init.c |  31 ++++++++
 libavfilter/aarch64/af_anlmdn_neon.S | 112 +++++++++++++++++++++++++++
 libavfilter/af_anlmdn.c              |   3 +
 libavfilter/af_anlmdndsp.h           |   1 +
 5 files changed, 149 insertions(+)
 create mode 100644 libavfilter/aarch64/af_anlmdn_init.c
 create mode 100644 libavfilter/aarch64/af_anlmdn_neon.S

Comments

Carl Eugen Hoyos Jan. 24, 2020, 4:13 p.m. UTC | #1
Am Fr., 24. Jan. 2020 um 10:15 Uhr schrieb Zhao Zhili <quinkblack@foxmail.com>:
>
> ./ffmpeg -threads 1  -f lavfi -t 60 -i anoisesrc -af 'anlmdn' -f null -benchmark -
>
> Test results on Snapdragon 845:
>     Before:
>         size=N/A time=00:01:00.00 bitrate=N/A speed=11.2x
>         video:0kB audio:5625kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
>         bench: utime=5.320s stime=0.010s rtime=5.358s
>         bench: maxrss=14172kB
>
>     After:
>         size=N/A time=00:01:00.00 bitrate=N/A speed=15.4x
>         video:0kB audio:5625kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
>         bench: utime=3.870s stime=0.000s rtime=3.902s
>         bench: maxrss=14036kB

In case anybody is curious:
This is a higher speedup than the x86 asm optimization offers.

Carl Eugen
zhilizhao Feb. 1, 2020, 10:08 a.m. UTC | #2
> On Jan 25, 2020, at 12:13 AM, Carl Eugen Hoyos <ceffmpeg@gmail.com> wrote:
> 
> Am Fr., 24. Jan. 2020 um 10:15 Uhr schrieb Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>>:
>> 
>> ./ffmpeg -threads 1  -f lavfi -t 60 -i anoisesrc -af 'anlmdn' -f null -benchmark -
>> 
>> Test results on Snapdragon 845:
>>    Before:
>>        size=N/A time=00:01:00.00 bitrate=N/A speed=11.2x
>>        video:0kB audio:5625kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
>>        bench: utime=5.320s stime=0.010s rtime=5.358s
>>        bench: maxrss=14172kB
>> 
>>    After:
>>        size=N/A time=00:01:00.00 bitrate=N/A speed=15.4x
>>        video:0kB audio:5625kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
>>        bench: utime=3.870s stime=0.000s rtime=3.902s
>>        bench: maxrss=14036kB
> 
> In case anybody is curious:
> This is a higher speedup than the x86 asm optimization offers.
> 

Ping for review, thanks!

> Carl Eugen
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
diff mbox series

Patch

diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index f52d7a4842..6c727f9859 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,5 +1,7 @@ 
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_afir_init.o
+OBJS-$(CONFIG_ANLMDN_FILTER)                 += aarch64/af_anlmdn_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
 
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_afir_neon.o
+NEON-OBJS-$(CONFIG_ANLMDN_FILTER)            += aarch64/af_anlmdn_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/af_anlmdn_init.c b/libavfilter/aarch64/af_anlmdn_init.c
new file mode 100644
index 0000000000..e28a152e04
--- /dev/null
+++ b/libavfilter/aarch64/af_anlmdn_init.c
@@ -0,0 +1,31 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/af_anlmdndsp.h"
+
+float ff_compute_distance_ssd_neon(const float *f1, const float *f2,
+                                   ptrdiff_t len);
+
+av_cold void ff_anlmdn_init_aarch64(AudioNLMDNDSPContext *s)
+{
+    const int flags = av_get_cpu_flags();
+    /* Override compute_distance_ssd only when NEON is available; otherwise leave s untouched. */
+    if (have_neon(flags))
+        s->compute_distance_ssd = ff_compute_distance_ssd_neon;
+}
diff --git a/libavfilter/aarch64/af_anlmdn_neon.S b/libavfilter/aarch64/af_anlmdn_neon.S
new file mode 100644
index 0000000000..3ad985b476
--- /dev/null
+++ b/libavfilter/aarch64/af_anlmdn_neon.S
@@ -0,0 +1,112 @@ 
+/*
+ * Copyright (c) 2020 Zhao Zhili
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// float ff_compute_distance_ssd_neon(const float *f1, const float *f2, ptrdiff_t len);
+function ff_compute_distance_ssd_neon, export=1
+	fmov	s0, wzr				// s0 = 0.0: running sum of squared differences
+	add	x3, x0, x2, lsl #2		// x3 = f1 + len floats (last f1 element that is read)
+	sub	x0, x0, x2, lsl #2		// x0 = f1 - len floats (first f1 element that is read)
+	sub	x1, x1, x2, lsl #2		// x1 = f2 - len floats (first f2 element that is read)
+	add	x3, x3, #4			// x3 = one element past the end: 2 * len + 1 pairs in total
+
+	// 32 pairs (128 bytes from each input) per iteration, while that many remain
+	add	x4, x0, #128			// x4 = where x0 would be after a 32-pair pass
+	cmp	x4, x3
+	b.gt	2f				// would overrun the end; NOTE(review): signed compare of addresses
+1:	ld1	{v16.4S, v17.4S, v18.4S, v19.4S}, [x0], #64	// f1, first 16 floats
+	ld1	{v20.4S, v21.4S, v22.4S, v23.4S}, [x1], #64	// f2, first 16 floats
+	ld1	{v24.4S, v25.4S, v26.4S, v27.4S}, [x0], #64	// f1, next 16 floats
+	ld1	{v28.4S, v29.4S, v30.4S, v31.4S}, [x1], #64	// f2, next 16 floats
+	// v16 and v17 become two accumulators; subtracts and multiplies are interleaved (presumably for latency)
+	fsub	v16.4S, v16.4S, v20.4S		// d0 = f1 - f2
+
+	fsub	v17.4S, v17.4S, v21.4S		// d1
+	fmul	v16.4S, v16.4S, v16.4S		// v16 = d0^2
+
+	fsub	v18.4S, v18.4S, v22.4S		// d2
+	fmul	v17.4S, v17.4S, v17.4S		// v17 = d1^2
+
+	fsub	v19.4S, v19.4S, v23.4S		// d3
+	fmla	v16.4S, v18.4S, v18.4S		// v16 += d2^2
+
+	fsub	v24.4S, v24.4S, v28.4S		// d4
+	fmla	v17.4S, v19.4S, v19.4S		// v17 += d3^2
+
+	fsub	v25.4S, v25.4S, v29.4S		// d5
+	fmla	v16.4S, v24.4S, v24.4S		// v16 += d4^2
+
+	fsub	v26.4S, v26.4S, v30.4S		// d6
+	fmla	v17.4S, v25.4S, v25.4S		// v17 += d5^2
+
+	fsub	v27.4S, v27.4S, v31.4S		// d7
+	fmla	v16.4S, v26.4S, v26.4S		// v16 += d6^2
+
+	fmla	v17.4S, v27.4S, v27.4S		// v17 += d7^2
+
+	fadd	v1.4S, v16.4S, v17.4S		// merge the two accumulators
+	faddp	v1.4S, v1.4S, v1.4S		// pairwise add: lanes 0 and 1 now hold the two half sums
+	faddp	s1, v1.2S			// s1 = sum of all four lanes
+	fadd	s0, s0, s1			// fold this pass into the running sum
+	add	x4, x0, #128
+	cmp	x4, x3
+	b.le	1b				// another 32 pairs still fit before x3
+
+	// 16 pairs (64 bytes from each input) per iteration, same scheme as above
+2:	add	x4, x0, #64			// x4 = where x0 would be after a 16-pair pass
+	cmp	x4, x3
+	b.gt	4f				// fewer than 16 pairs left
+3:	ld1	{v16.4S, v17.4S, v18.4S, v19.4S}, [x0], #64	// 16 floats of f1
+	ld1	{v20.4S, v21.4S, v22.4S, v23.4S}, [x1], #64	// 16 floats of f2
+
+	fsub	v16.4S, v16.4S, v20.4S		// d0 = f1 - f2
+
+	fsub	v17.4S, v17.4S, v21.4S		// d1
+	fmul	v16.4S, v16.4S, v16.4S		// v16 = d0^2
+
+	fsub	v18.4S, v18.4S, v22.4S		// d2
+	fmul	v17.4S, v17.4S, v17.4S		// v17 = d1^2
+
+	fsub	v19.4S, v19.4S, v23.4S		// d3
+	fmla	v16.4S, v18.4S, v18.4S		// v16 += d2^2
+
+	fmla	v17.4S, v19.4S, v19.4S		// v17 += d3^2
+
+	fadd	v1.4S, v16.4S, v17.4S		// merge the two accumulators
+	faddp	v1.4S, v1.4S, v1.4S		// pairwise add: lanes 0 and 1 now hold the two half sums
+	faddp	s1, v1.2S			// s1 = sum of all four lanes
+	fadd	s0, s0, s1			// fold this pass into the running sum
+	add	x4, x0, #64
+	cmp	x4, x3
+	b.le	3b				// another 16 pairs still fit before x3
+
+	// scalar tail: one pair per iteration until x0 reaches x3
+4:	cmp	x0, x3
+	b.eq	6f				// nothing left to process
+5:	ldr	s1, [x0], #4			// next f1 element
+	ldr	s2, [x1], #4			// next f2 element
+	fsub	s1, s1, s2			// difference
+	cmp	x0, x3				// flags for the b.ne below
+	fmadd	s0, s1, s1, s0			// s0 += difference^2
+	b.ne	5b
+6:	ret					// accumulated sum is left in s0
+
+endfunc
diff --git a/libavfilter/af_anlmdn.c b/libavfilter/af_anlmdn.c
index b8aef31c35..63bc1a1f2c 100644
--- a/libavfilter/af_anlmdn.c
+++ b/libavfilter/af_anlmdn.c
@@ -145,6 +145,9 @@  void ff_anlmdn_init(AudioNLMDNDSPContext *dsp)
 
     if (ARCH_X86)
         ff_anlmdn_init_x86(dsp);
+    if (ARCH_AARCH64) {
+        ff_anlmdn_init_aarch64(dsp);
+    }
 }
 
 static int config_output(AVFilterLink *outlink)
diff --git a/libavfilter/af_anlmdndsp.h b/libavfilter/af_anlmdndsp.h
index d8f5136cd8..f9d8a80c83 100644
--- a/libavfilter/af_anlmdndsp.h
+++ b/libavfilter/af_anlmdndsp.h
@@ -35,6 +35,7 @@  typedef struct AudioNLMDNDSPContext {
 } AudioNLMDNDSPContext;
 
 void ff_anlmdn_init(AudioNLMDNDSPContext *s);
+void ff_anlmdn_init_aarch64(AudioNLMDNDSPContext *s);
 void ff_anlmdn_init_x86(AudioNLMDNDSPContext *s);
 
 #endif /* AVFILTER_ANLMDNDSP_H */