diff mbox series

[FFmpeg-devel] avfilter/scene_sad: add AArch64 SIMD

Message ID 20200201095717.4877-1-quinkblack@foxmail.com
State New
Headers show
Series [FFmpeg-devel] avfilter/scene_sad: add AArch64 SIMD | expand

Checks

Context Check Description
andriy/ffmpeg-patchwork fail Failed to apply patch

Commit Message

Zhao Zhili Feb. 1, 2020, 9:57 a.m. UTC
From: Zhao Zhili <quinkblack@foxmail.com>

For 8 bit depth:
    ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p' -vf 'freezedetect' -f null -benchmark -

    Test results on Snapdragon 845:
    Before:
        frame=  250 fps= 23 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.924x
	bench: utime=8.360s stime=2.350s rtime=10.820s
    After:
        frame=  250 fps= 51 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=2.04x
	bench: utime=2.650s stime=2.210s rtime=4.909s

    Test results on HiSilicon Kirin 970:
    Before:
        frame=  250 fps=6.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.239x
        bench: utime=35.156s stime=6.604s rtime=41.820s
    After:
        frame=  250 fps= 10 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.403x
	bench: utime=18.400s stime=6.376s rtime=24.798s

For 16 bit depth:
    ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p16' -vf 'freezedetect' -f null -benchmark -

    Test results on Snapdragon 845:
    Before:
        frame=  250 fps= 19 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.756x
	bench: utime=8.700s stime=4.410s rtime=13.226s
    After:
	frame=  250 fps= 27 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=1.07x
	bench: utime=4.920s stime=4.350s rtime=9.356s

    Test results on HiSilicon Kirin 970:
    Before:
        frame=  250 fps=4.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.161x
	bench: utime=48.868s stime=13.124s rtime=62.110s
    After:
        frame=  250 fps=5.1 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.205x
	bench: utime=35.600s stime=13.036s rtime=48.708s
---
 libavfilter/aarch64/Makefile         |   2 +
 libavfilter/aarch64/scene_sad_init.c |  37 +++++++
 libavfilter/aarch64/scene_sad_neon.S | 149 +++++++++++++++++++++++++++
 libavfilter/scene_sad.c              |   2 +
 libavfilter/scene_sad.h              |   2 +
 5 files changed, 192 insertions(+)
 create mode 100644 libavfilter/aarch64/scene_sad_init.c
 create mode 100644 libavfilter/aarch64/scene_sad_neon.S

Comments

Marton Balint Feb. 1, 2020, 8:26 p.m. UTC | #1
On Sat, 1 Feb 2020, quinkblack@foxmail.com wrote:

> From: Zhao Zhili <quinkblack@foxmail.com>
>
> For 8 bit depth:
>    ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p' -vf 'freezedetect' -f null -benchmark -
>
>    Test results on Snapdragon 845:
>    Before:
>        frame=  250 fps= 23 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.924x
> 	bench: utime=8.360s stime=2.350s rtime=10.820s
>    After:
>        frame=  250 fps= 51 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=2.04x
> 	bench: utime=2.650s stime=2.210s rtime=4.909s
>
>    Test results on HiSilicon Kirin 970:
>    Before:
>        frame=  250 fps=6.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.239x
>        bench: utime=35.156s stime=6.604s rtime=41.820s
>    After:
>        frame=  250 fps= 10 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.403x
> 	bench: utime=18.400s stime=6.376s rtime=24.798s
>
> For 16 bit depth:
>    ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p16' -vf 'freezedetect' -f null -benchmark -
>
>    Test results on Snapdragon 845
>    Before:
>        frame=  250 fps= 19 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.756x
> 	bench: utime=8.700s stime=4.410s rtime=13.226s
>    After:
> 	frame=  250 fps= 27 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=1.07x
> 	bench: utime=4.920s stime=4.350s rtime=9.356s
>
>    Test results on HiSilicon Kirin 970:
>    Before:
>        frame=  250 fps=4.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.161x
> 	bench: utime=48.868s stime=13.124s rtime=62.110s
>    After:
>        frame=  250 fps=5.1 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.205x
> 	bench: utime=35.600s stime=13.036s rtime=48.708s
> ---
> libavfilter/aarch64/Makefile         |   2 +
> libavfilter/aarch64/scene_sad_init.c |  37 +++++++
> libavfilter/aarch64/scene_sad_neon.S | 149 +++++++++++++++++++++++++++
> libavfilter/scene_sad.c              |   2 +
> libavfilter/scene_sad.h              |   2 +
> 5 files changed, 192 insertions(+)
> create mode 100644 libavfilter/aarch64/scene_sad_init.c
> create mode 100644 libavfilter/aarch64/scene_sad_neon.S

Does your ASM handle cases where the width is not a multiple of the 
vector size? If not, then you should probably do something similar to what 
is done for X86.

Thanks,
Marton

>
> diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
> index 6c727f9859..3a458f511f 100644
> --- a/libavfilter/aarch64/Makefile
> +++ b/libavfilter/aarch64/Makefile
> @@ -1,7 +1,9 @@
> OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_afir_init.o
> OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_anlmdn_init.o
> +OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/scene_sad_init.o
> OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
> 
> NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_afir_neon.o
> NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_anlmdn_neon.o
> +NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/scene_sad_neon.o
> NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
> diff --git a/libavfilter/aarch64/scene_sad_init.c b/libavfilter/aarch64/scene_sad_init.c
> new file mode 100644
> index 0000000000..8de769ac10
> --- /dev/null
> +++ b/libavfilter/aarch64/scene_sad_init.c
> @@ -0,0 +1,37 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/cpu.h"
> +#include "libavfilter/scene_sad.h"
> +
> +void ff_scene_sad_neon(SCENE_SAD_PARAMS);
> +
> +void ff_scene_sad16_neon(SCENE_SAD_PARAMS);
> +
> +ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +    if (have_neon(cpu_flags)) {
> +        if (depth == 8)
> +            return ff_scene_sad_neon;
> +        if (depth == 16)
> +            return ff_scene_sad16_neon;
> +    }
> +
> +    return NULL;
> +}
> diff --git a/libavfilter/aarch64/scene_sad_neon.S b/libavfilter/aarch64/scene_sad_neon.S
> new file mode 100644
> index 0000000000..5b3b027a53
> --- /dev/null
> +++ b/libavfilter/aarch64/scene_sad_neon.S
> @@ -0,0 +1,149 @@
> +/*
> + * Copyright (c) 2020 Zhao Zhili
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +// void ff_scene_sadx_neon(const uint8_t *src1, ptrdiff_t stride1,
> +//                         const uint8_t *src2, ptrdiff_t stride2,
> +//                         ptrdiff_t width, ptrdiff_t height,
> +//                         uint64_t *sum)
> +.macro	scene_sad_neon, depth=8
> +	// x0: src1
> +	// x1: stride1
> +	// x2: src2
> +	// x3: stride2
> +	// x4: width
> +	// x5: height
> +	// x6: sum
> +
> +	// x7: step of width loop
> +	// x8: index of row
> +	// x9: width / x7 * x7
> +	// x10: sad
> +	// x11: index of column
> +	// w12: src1[x]
> +	// w13: src2[x]
> +
> +	mov	x8, xzr
> +	mov	x10, xzr
> +
> +.if \depth == 8
> +	mov	x7, #64
> +	and	x9, x4, #0xFFFFFFFFFFFFFFC0
> +.endif
> +
> +.if \depth == 16
> +	mov	x7, #32
> +	and	x9, x4, #0xFFFFFFFFFFFFFFE0
> +.endif
> +
> +1:	cmp	x4, x7		// check width
> +	mov	x11, xzr
> +	b.lt	3f
> +
> +	mov	v0.d[0], x10
> +
> +	// vector loop
> +2:
> +.if \depth == 8
> +	add	x14, x0, x11
> +	add	x15, x2, x11
> +.endif
> +
> +.if \depth == 16
> +	add	x14, x0, x11, lsl #1
> +	add	x15, x2, x11, lsl #1
> +.endif
> +	ld1	{v16.4S, v17.4S, v18.4S, v19.4S}, [x14]
> +	ld1	{v20.4S, v21.4S, v22.4S, v23.4S}, [x15]
> +	add	x11, x11, x7
> +	cmp	x9, x11
> +
> +.if \depth == 8
> +	uabd	v16.16B, v16.16B, v20.16B
> +	uabd	v17.16B, v17.16B, v21.16B
> +	uabd	v18.16B, v18.16B, v22.16B
> +	uabd	v19.16B, v19.16B, v23.16B
> +	uaddlv	h16, v16.16B
> +	uaddlv	h17, v17.16B
> +	uaddlv	h18, v18.16B
> +	uaddlv	h19, v19.16B
> +.endif
> +
> +.if \depth == 16
> +	uabd	v16.8H, v16.8H, v20.8H
> +	uabd	v17.8H, v17.8H, v21.8H
> +	uabd	v18.8H, v18.8H, v22.8H
> +	uabd	v19.8H, v19.8H, v23.8H
> +	uaddlv	s16, v16.8H
> +	uaddlv	s17, v17.8H
> +	uaddlv	s18, v18.8H
> +	uaddlv	s19, v19.8H
> +.endif
> +
> +	add	d16, d16, d17
> +	add	d18, d18, d19
> +	add	d0, d0, d16
> +	add	d0, d0, d18
> +
> +	b.ne	2b
> +
> +	cmp	x9, x4
> +	fmov	x10, d0
> +	b.eq	4f
> +
> +	// scalar loop
> +3:
> +.if \depth == 8
> +	ldrb	w12, [x0, x11]
> +	ldrb	w13, [x2, x11]
> +.endif
> +
> +.if \depth == 16
> +	ldrh	w12, [x0, x11, lsl #1]
> +	ldrh	w13, [x2, x11, lsl #1]
> +.endif
> +	add	x11, x11, #1
> +	subs	w12, w12, w13
> +	cneg	w12, w12, mi
> +	add	x10, x10, x12
> +	cmp	x11, x4
> +	b.ne	3b
> +
> +	// next row
> +4:
> +	add	x8, x8, #1              // =1
> +	add	x0, x0, x1
> +	cmp	x8, x5
> +	add	x2, x2, x3
> +	b.ne	1b
> +
> +5:
> +	str	x10, [x6]
> +	ret
> +.endm
> +
> +function ff_scene_sad_neon, export=1
> +	scene_sad_neon	depth=8
> +endfunc
> +
> +function ff_scene_sad16_neon, export=1
> +	scene_sad_neon	depth=16
> +endfunc
> diff --git a/libavfilter/scene_sad.c b/libavfilter/scene_sad.c
> index 73d3eacbfa..ee0c71f659 100644
> --- a/libavfilter/scene_sad.c
> +++ b/libavfilter/scene_sad.c
> @@ -61,6 +61,8 @@ ff_scene_sad_fn ff_scene_sad_get_fn(int depth)
>     ff_scene_sad_fn sad = NULL;
>     if (ARCH_X86)
>         sad = ff_scene_sad_get_fn_x86(depth);
> +    if (ARCH_AARCH64)
> +        sad = ff_scene_sad_get_fn_aarch64(depth);
>     if (!sad) {
>         if (depth == 8)
>             sad = ff_scene_sad_c;
> diff --git a/libavfilter/scene_sad.h b/libavfilter/scene_sad.h
> index 173a051f2b..c868200dc4 100644
> --- a/libavfilter/scene_sad.h
> +++ b/libavfilter/scene_sad.h
> @@ -37,6 +37,8 @@ void ff_scene_sad_c(SCENE_SAD_PARAMS);
> 
> void ff_scene_sad16_c(SCENE_SAD_PARAMS);
> 
> +ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth);
> +
> ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth);
> 
> ff_scene_sad_fn ff_scene_sad_get_fn(int depth);
> -- 
> 2.22.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Zhao Zhili Feb. 2, 2020, 4:54 a.m. UTC | #2
> On Feb 2, 2020, at 4:26 AM, Marton Balint <cus@passwd.hu> wrote:
> 
> 
> 
> On Sat, 1 Feb 2020, quinkblack@foxmail.com <mailto:quinkblack@foxmail.com> wrote:
> 
>> From: Zhao Zhili <quinkblack@foxmail.com>
>> 
>> For 8 bit depth:
>>   ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p' -vf 'freezedetect' -f null -benchmark -
>> 
>>   Test results on Snapdragon 845:
>>   Before:
>>       frame=  250 fps= 23 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.924x
>> 	bench: utime=8.360s stime=2.350s rtime=10.820s
>>   After:
>>       frame=  250 fps= 51 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=2.04x
>> 	bench: utime=2.650s stime=2.210s rtime=4.909s
>> 
>>   Test results on HiSilicon Kirin 970:
>>   Before:
>>       frame=  250 fps=6.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.239x
>>       bench: utime=35.156s stime=6.604s rtime=41.820s
>>   After:
>>       frame=  250 fps= 10 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.403x
>> 	bench: utime=18.400s stime=6.376s rtime=24.798s
>> 
>> For 16 bit depth:
>>   ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p16' -vf 'freezedetect' -f null -benchmark -
>> 
>>   Test results on Snapdragon 845
>>   Before:
>>       frame=  250 fps= 19 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.756x
>> 	bench: utime=8.700s stime=4.410s rtime=13.226s
>>   After:
>> 	frame=  250 fps= 27 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=1.07x
>> 	bench: utime=4.920s stime=4.350s rtime=9.356s
>> 
>>   Test results on HiSilicon Kirin 970:
>>   Before:
>>       frame=  250 fps=4.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.161x
>> 	bench: utime=48.868s stime=13.124s rtime=62.110s
>>   After:
>>       frame=  250 fps=5.1 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.205x
>> 	bench: utime=35.600s stime=13.036s rtime=48.708s
>> ---
>> libavfilter/aarch64/Makefile         |   2 +
>> libavfilter/aarch64/scene_sad_init.c |  37 +++++++
>> libavfilter/aarch64/scene_sad_neon.S | 149 +++++++++++++++++++++++++++
>> libavfilter/scene_sad.c              |   2 +
>> libavfilter/scene_sad.h              |   2 +
>> 5 files changed, 192 insertions(+)
>> create mode 100644 libavfilter/aarch64/scene_sad_init.c
>> create mode 100644 libavfilter/aarch64/scene_sad_neon.S
> 
> Does your ASM handles cases when width is not a multiple of the vector size? If not, then you should probably do something similar to what is done for X86.
> 

The code after `+	// scalar loop` handles that. It supports width and height >= 1.

> Thanks,
> Marton
> 
>> 
>> diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
>> index 6c727f9859..3a458f511f 100644
>> --- a/libavfilter/aarch64/Makefile
>> +++ b/libavfilter/aarch64/Makefile
>> @@ -1,7 +1,9 @@
>> OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_afir_init.o
>> OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_anlmdn_init.o
>> +OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/scene_sad_init.o
>> OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
>> NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_afir_neon.o
>> NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_anlmdn_neon.o
>> +NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/scene_sad_neon.o
>> NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
>> diff --git a/libavfilter/aarch64/scene_sad_init.c b/libavfilter/aarch64/scene_sad_init.c
>> new file mode 100644
>> index 0000000000..8de769ac10
>> --- /dev/null
>> +++ b/libavfilter/aarch64/scene_sad_init.c
>> @@ -0,0 +1,37 @@
>> +/*
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/aarch64/cpu.h"
>> +#include "libavfilter/scene_sad.h"
>> +
>> +void ff_scene_sad_neon(SCENE_SAD_PARAMS);
>> +
>> +void ff_scene_sad16_neon(SCENE_SAD_PARAMS);
>> +
>> +ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth)
>> +{
>> +    int cpu_flags = av_get_cpu_flags();
>> +    if (have_neon(cpu_flags)) {
>> +        if (depth == 8)
>> +            return ff_scene_sad_neon;
>> +        if (depth == 16)
>> +            return ff_scene_sad16_neon;
>> +    }
>> +
>> +    return NULL;
>> +}
>> diff --git a/libavfilter/aarch64/scene_sad_neon.S b/libavfilter/aarch64/scene_sad_neon.S
>> new file mode 100644
>> index 0000000000..5b3b027a53
>> --- /dev/null
>> +++ b/libavfilter/aarch64/scene_sad_neon.S
>> @@ -0,0 +1,149 @@
>> +/*
>> + * Copyright (c) 2020 Zhao Zhili
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/aarch64/asm.S"
>> +
>> +// void ff_scene_sadx_neon(const uint8_t *src1, ptrdiff_t stride1,
>> +//                         const uint8_t *src2, ptrdiff_t stride2,
>> +//                         ptrdiff_t width, ptrdiff_t height,
>> +//                         uint64_t *sum)
>> +.macro	scene_sad_neon, depth=8
>> +	// x0: src1
>> +	// x1: stride1
>> +	// x2: src2
>> +	// x3: stride2
>> +	// x4: width
>> +	// x5: height
>> +	// x6: sum
>> +
>> +	// x7: step of width loop
>> +	// x8: index of row
>> +	// x9: width / x7 * x7
>> +	// x10: sad
>> +	// x11: index of column
>> +	// w12: src1[x]
>> +	// w13: src2[x]
>> +
>> +	mov	x8, xzr
>> +	mov	x10, xzr
>> +
>> +.if \depth == 8
>> +	mov	x7, #64
>> +	and	x9, x4, #0xFFFFFFFFFFFFFFC0
>> +.endif
>> +
>> +.if \depth == 16
>> +	mov	x7, #32
>> +	and	x9, x4, #0xFFFFFFFFFFFFFFE0
>> +.endif
>> +
>> +1:	cmp	x4, x7		// check width
>> +	mov	x11, xzr
>> +	b.lt	3f
>> +
>> +	mov	v0.d[0], x10
>> +
>> +	// vector loop
>> +2:
>> +.if \depth == 8
>> +	add	x14, x0, x11
>> +	add	x15, x2, x11
>> +.endif
>> +
>> +.if \depth == 16
>> +	add	x14, x0, x11, lsl #1
>> +	add	x15, x2, x11, lsl #1
>> +.endif
>> +	ld1	{v16.4S, v17.4S, v18.4S, v19.4S}, [x14]
>> +	ld1	{v20.4S, v21.4S, v22.4S, v23.4S}, [x15]
>> +	add	x11, x11, x7
>> +	cmp	x9, x11
>> +
>> +.if \depth == 8
>> +	uabd	v16.16B, v16.16B, v20.16B
>> +	uabd	v17.16B, v17.16B, v21.16B
>> +	uabd	v18.16B, v18.16B, v22.16B
>> +	uabd	v19.16B, v19.16B, v23.16B
>> +	uaddlv	h16, v16.16B
>> +	uaddlv	h17, v17.16B
>> +	uaddlv	h18, v18.16B
>> +	uaddlv	h19, v19.16B
>> +.endif
>> +
>> +.if \depth == 16
>> +	uabd	v16.8H, v16.8H, v20.8H
>> +	uabd	v17.8H, v17.8H, v21.8H
>> +	uabd	v18.8H, v18.8H, v22.8H
>> +	uabd	v19.8H, v19.8H, v23.8H
>> +	uaddlv	s16, v16.8H
>> +	uaddlv	s17, v17.8H
>> +	uaddlv	s18, v18.8H
>> +	uaddlv	s19, v19.8H
>> +.endif
>> +
>> +	add	d16, d16, d17
>> +	add	d18, d18, d19
>> +	add	d0, d0, d16
>> +	add	d0, d0, d18
>> +
>> +	b.ne	2b
>> +
>> +	cmp	x9, x4
>> +	fmov	x10, d0
>> +	b.eq	4f
>> +
>> +	// scalar loop
>> +3:
>> +.if \depth == 8
>> +	ldrb	w12, [x0, x11]
>> +	ldrb	w13, [x2, x11]
>> +.endif
>> +
>> +.if \depth == 16
>> +	ldrh	w12, [x0, x11, lsl #1]
>> +	ldrh	w13, [x2, x11, lsl #1]
>> +.endif
>> +	add	x11, x11, #1
>> +	subs	w12, w12, w13
>> +	cneg	w12, w12, mi
>> +	add	x10, x10, x12
>> +	cmp	x11, x4
>> +	b.ne	3b
>> +
>> +	// next row
>> +4:
>> +	add	x8, x8, #1              // =1
>> +	add	x0, x0, x1
>> +	cmp	x8, x5
>> +	add	x2, x2, x3
>> +	b.ne	1b
>> +
>> +5:
>> +	str	x10, [x6]
>> +	ret
>> +.endm
>> +
>> +function ff_scene_sad_neon, export=1
>> +	scene_sad_neon	depth=8
>> +endfunc
>> +
>> +function ff_scene_sad16_neon, export=1
>> +	scene_sad_neon	depth=16
>> +endfunc
>> diff --git a/libavfilter/scene_sad.c b/libavfilter/scene_sad.c
>> index 73d3eacbfa..ee0c71f659 100644
>> --- a/libavfilter/scene_sad.c
>> +++ b/libavfilter/scene_sad.c
>> @@ -61,6 +61,8 @@ ff_scene_sad_fn ff_scene_sad_get_fn(int depth)
>>    ff_scene_sad_fn sad = NULL;
>>    if (ARCH_X86)
>>        sad = ff_scene_sad_get_fn_x86(depth);
>> +    if (ARCH_AARCH64)
>> +        sad = ff_scene_sad_get_fn_aarch64(depth);
>>    if (!sad) {
>>        if (depth == 8)
>>            sad = ff_scene_sad_c;
>> diff --git a/libavfilter/scene_sad.h b/libavfilter/scene_sad.h
>> index 173a051f2b..c868200dc4 100644
>> --- a/libavfilter/scene_sad.h
>> +++ b/libavfilter/scene_sad.h
>> @@ -37,6 +37,8 @@ void ff_scene_sad_c(SCENE_SAD_PARAMS);
>> void ff_scene_sad16_c(SCENE_SAD_PARAMS);
>> +ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth);
>> +
>> ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth);
>> ff_scene_sad_fn ff_scene_sad_get_fn(int depth);
>> -- 
>> 2.22.0
>> 
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>
>> 
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
diff mbox series

Patch

diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index 6c727f9859..3a458f511f 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,7 +1,9 @@ 
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_afir_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_anlmdn_init.o
+OBJS-$(CONFIG_SCENE_SAD)                     += aarch64/scene_sad_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
 
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_afir_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_anlmdn_neon.o
+NEON-OBJS-$(CONFIG_SCENE_SAD)                += aarch64/scene_sad_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/scene_sad_init.c b/libavfilter/aarch64/scene_sad_init.c
new file mode 100644
index 0000000000..8de769ac10
--- /dev/null
+++ b/libavfilter/aarch64/scene_sad_init.c
@@ -0,0 +1,37 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/scene_sad.h"
+
+void ff_scene_sad_neon(SCENE_SAD_PARAMS);
+
+void ff_scene_sad16_neon(SCENE_SAD_PARAMS);
+
+ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth) /* NEON SAD kernel for this bit depth, or NULL */
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (have_neon(cpu_flags)) {          /* only hand out NEON code when the CPU flags report NEON */
+        if (depth == 8)
+            return ff_scene_sad_neon;    /* macro instance built with depth=8 */
+        if (depth == 16)
+            return ff_scene_sad16_neon;  /* macro instance built with depth=16 */
+    }
+
+    return NULL;                         /* no kernel for this depth or no NEON: ff_scene_sad_get_fn() falls back when it sees NULL */
+}
diff --git a/libavfilter/aarch64/scene_sad_neon.S b/libavfilter/aarch64/scene_sad_neon.S
new file mode 100644
index 0000000000..5b3b027a53
--- /dev/null
+++ b/libavfilter/aarch64/scene_sad_neon.S
@@ -0,0 +1,149 @@ 
+/*
+ * Copyright (c) 2020 Zhao Zhili
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// void ff_scene_sad{,16}_neon(const uint8_t *src1, ptrdiff_t stride1,
+//                             const uint8_t *src2, ptrdiff_t stride2,
+//                             ptrdiff_t width, ptrdiff_t height,
+//                             uint64_t *sum)
+.macro	scene_sad_neon, depth=8
+	// x0: src1 (advanced by stride1 after every row)
+	// x1: stride1, in bytes (added to x0 unscaled for both depths)
+	// x2: src2 (advanced by stride2 after every row)
+	// x3: stride2, in bytes (added to x2 unscaled for both depths)
+	// x4: width, in samples (must be >= 1, see the scalar loop)
+	// x5: height, in rows (must be >= 1, see the row loop)
+	// x6: sum (receives the 64-bit total on exit)
+
+	// x7: step of width loop, in samples (always 64 bytes per source)
+	// x8: index of row
+	// x9: width rounded down to a multiple of x7 (end of the vector part)
+	// x10: sad (running total; lives in d0 while the vector loop runs)
+	// x11: index of column, in samples
+	// w12: src1[x], then |src1[x] - src2[x]|
+	// w13: src2[x]
+
+	mov	x8, xzr		// row = 0
+	mov	x10, xzr	// sad = 0
+
+.if \depth == 8
+	mov	x7, #64		// 64 one-byte samples = 64 bytes per step
+	and	x9, x4, #0xFFFFFFFFFFFFFFC0	// x9 = width & ~63
+.endif
+
+.if \depth == 16
+	mov	x7, #32		// 32 two-byte samples = 64 bytes per step
+	and	x9, x4, #0xFFFFFFFFFFFFFFE0	// x9 = width & ~31
+.endif
+
+1:	cmp	x4, x7		// row start: is the row at least one vector step wide?
+	mov	x11, xzr	// column = 0 (mov does not disturb the flags from cmp)
+	b.lt	3f		// width < step: whole row goes through the scalar loop
+
+	mov	v0.d[0], x10	// carry the running total into d0 for the vector loop
+
+	// vector loop: 64 bytes of each source per iteration, x11 runs 0 .. x9
+2:
+.if \depth == 8
+	add	x14, x0, x11	// x14 = &src1[x]
+	add	x15, x2, x11	// x15 = &src2[x]
+.endif
+
+.if \depth == 16
+	add	x14, x0, x11, lsl #1	// x14 = &src1[x], two bytes per sample
+	add	x15, x2, x11, lsl #1	// x15 = &src2[x], two bytes per sample
+.endif
+	ld1	{v16.4S, v17.4S, v18.4S, v19.4S}, [x14]	// 64 bytes of src1
+	ld1	{v20.4S, v21.4S, v22.4S, v23.4S}, [x15]	// 64 bytes of src2
+	add	x11, x11, x7	// x += step
+	cmp	x9, x11		// flags are consumed by "b.ne 2b" below; the SIMD ops in between do not write NZCV
+
+.if \depth == 8
+	uabd	v16.16B, v16.16B, v20.16B	// per-byte |src1 - src2|
+	uabd	v17.16B, v17.16B, v21.16B
+	uabd	v18.16B, v18.16B, v22.16B
+	uabd	v19.16B, v19.16B, v23.16B
+	uaddlv	h16, v16.16B	// add up the 16 byte lanes into a 16-bit scalar
+	uaddlv	h17, v17.16B	// (scalar writes zero the rest of the register,
+	uaddlv	h18, v18.16B	//  so the d-register adds below see the plain sums)
+	uaddlv	h19, v19.16B
+.endif
+
+.if \depth == 16
+	uabd	v16.8H, v16.8H, v20.8H	// per-halfword |src1 - src2|
+	uabd	v17.8H, v17.8H, v21.8H
+	uabd	v18.8H, v18.8H, v22.8H
+	uabd	v19.8H, v19.8H, v23.8H
+	uaddlv	s16, v16.8H	// add up the 8 halfword lanes into a 32-bit scalar
+	uaddlv	s17, v17.8H	// (scalar writes zero the rest of the register,
+	uaddlv	s18, v18.8H	//  so the d-register adds below see the plain sums)
+	uaddlv	s19, v19.8H
+.endif
+
+	add	d16, d16, d17	// fold the four partial sums ...
+	add	d18, d18, d19
+	add	d0, d0, d16	// ... into the 64-bit running total
+	add	d0, d0, d18
+
+	b.ne	2b		// loop until x == x9 (flags from "cmp x9, x11")
+
+	cmp	x9, x4		// any tail samples left in this row?
+	fmov	x10, d0		// bring the running total back to x10
+	b.eq	4f		// width is a multiple of the step: row finished
+
+	// scalar loop: one sample per iteration; the test comes after the body, so width must be >= 1
+3:
+.if \depth == 8
+	ldrb	w12, [x0, x11]	// w12 = src1[x]
+	ldrb	w13, [x2, x11]	// w13 = src2[x]
+.endif
+
+.if \depth == 16
+	ldrh	w12, [x0, x11, lsl #1]	// w12 = src1[x]
+	ldrh	w13, [x2, x11, lsl #1]	// w13 = src2[x]
+.endif
+	add	x11, x11, #1	// ++x
+	subs	w12, w12, w13	// w12 = src1[x] - src2[x], N set if negative
+	cneg	w12, w12, mi	// negate when negative: w12 = |difference|
+	add	x10, x10, x12	// W-register writes cleared the top half of x12
+	cmp	x11, x4
+	b.ne	3b		// until x == width
+
+	// next row: the test comes after the row has been processed, so height must be >= 1
+4:
+	add	x8, x8, #1              // ++row
+	add	x0, x0, x1	// src1 += stride1
+	cmp	x8, x5
+	add	x2, x2, x3	// src2 += stride2 (add does not disturb the flags from cmp)
+	b.ne	1b		// until row == height
+
+5:				// not branched to from within this macro
+	str	x10, [x6]	// *sum = sad
+	ret
+.endm
+
+function ff_scene_sad_neon, export=1
+	scene_sad_neon	depth=8
+endfunc
+
+function ff_scene_sad16_neon, export=1
+	scene_sad_neon	depth=16
+endfunc
diff --git a/libavfilter/scene_sad.c b/libavfilter/scene_sad.c
index 73d3eacbfa..ee0c71f659 100644
--- a/libavfilter/scene_sad.c
+++ b/libavfilter/scene_sad.c
@@ -61,6 +61,8 @@  ff_scene_sad_fn ff_scene_sad_get_fn(int depth)
     ff_scene_sad_fn sad = NULL;
     if (ARCH_X86)
         sad = ff_scene_sad_get_fn_x86(depth);
+    if (ARCH_AARCH64)
+        sad = ff_scene_sad_get_fn_aarch64(depth);
     if (!sad) {
         if (depth == 8)
             sad = ff_scene_sad_c;
diff --git a/libavfilter/scene_sad.h b/libavfilter/scene_sad.h
index 173a051f2b..c868200dc4 100644
--- a/libavfilter/scene_sad.h
+++ b/libavfilter/scene_sad.h
@@ -37,6 +37,8 @@  void ff_scene_sad_c(SCENE_SAD_PARAMS);
 
 void ff_scene_sad16_c(SCENE_SAD_PARAMS);
 
+ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth);
+
 ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth);
 
 ff_scene_sad_fn ff_scene_sad_get_fn(int depth);