From patchwork Sat Feb 1 09:57:17 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Zhao Zhili X-Patchwork-Id: 17643 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id 4515944A0B9 for ; Sat, 1 Feb 2020 11:57:33 +0200 (EET) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 1B5DB689EF9; Sat, 1 Feb 2020 11:57:33 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from qq.com (out203-205-221-239.mail.qq.com [203.205.221.239]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 68576689DBD for ; Sat, 1 Feb 2020 11:57:24 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=qq.com; s=s201512; t=1580551040; bh=b0EUTZgwDmjiBrU2KrYVc8Env8Ue1qj+yhcSvZ9uxkg=; h=From:To:Cc:Subject:Date; b=d2Q9Wczf6UfgSGxlVv5PXYoAKtIe38m3Y+sf+dq+5k0ymAMR4Grl3vPbL6fR8tM5/ 1NJFVegOxBcxFjiFB2bsUisvvZAeYNM5rGdHf7yLPhNDTdpAOiN86CozUFmP4wv8hj DtbWH5A5DTOc2apf/SB/wf6oZAMTzRIa7nPu2MJo= Received: from ZHILIZHAO-MB1.tencent.com ([27.38.252.134]) by newxmesmtplogicsvrszc1.qq.com (NewEsmtp) with SMTP id E538F8CA; Sat, 01 Feb 2020 17:57:19 +0800 X-QQ-mid: xmsmtpt1580551039t9a70c8ws X-QQ-XMAILINFO: OJIMiv3+YDTcPQF5HbU5gZPvBX4DEt/ecUsMgHevtk+WrLqa0ofmctmw9cnlgp iC1h7IjrIzNvHRSG1ochlpXC4cGzvM0vzDHHZfl5u+c+GtqVubQnv62hv5NChIDtA87fqOfxaRFU Hdb6Df0wfNY4tEvK7uDu/gI2S2Ez4zxloo3JmI9QJD1EqbbcFLG1DKWwCTX6/qrRHc9F6jc0fXHk WrUGXFkh3QJ/Y8jSu/lTiVEFOYIfrvbDZHef+TrkxhHMe92xD0kA+f5y2PMwAv4D5tW6BNgSLDyY gLdJ3IyA0TIqUH9FBNC0qLB2FgBiOb3EMRApO8a2/Cyz81SpdgRQpans8nuwVBlLo1x5g5NjSZ1A kC+p1CYzB1BSDL+MbKOVWP3mIiW7dBCjMcACner8QJdhvJwM+smptAf9JPGMcjKV1XOVJfHxERTe BtfyIGVm1SrxjeuS84mCKC/W8fwpSCOewDRJpF4R06grjvyoaY3cAXQnYgxPFHjgpM5Yr1vczZ71 
/ctKWZn6AcoS5nfrzsrMNxd69k3qjGfmPYoUVb5YWQihWA0eGtHhQcjjhC9jboMHx/uPqKxhVuX9 ORQSY= From: quinkblack@foxmail.com To: ffmpeg-devel@ffmpeg.org Date: Sat, 1 Feb 2020 17:57:17 +0800 Message-Id: <20200201095717.4877-1-quinkblack@foxmail.com> X-Mailer: git-send-email 2.22.0 MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH] avfilter/scene_sad: add AArch64 SIMD X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Zhao Zhili Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" From: Zhao Zhili For 8 bit depth: ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p' -vf 'freezedetect' -f null -benchmark - Test results on Snapdragon 845: Before: frame= 250 fps= 23 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.924x bench: utime=8.360s stime=2.350s rtime=10.820s After: frame= 250 fps= 51 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=2.04x bench: utime=2.650s stime=2.210s rtime=4.909s Test results on HiSilicon Kirin 970: Before: frame= 250 fps=6.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.239x bench: utime=35.156s stime=6.604s rtime=41.820s After: frame= 250 fps= 10 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.403x bench: utime=18.400s stime=6.376s rtime=24.798s For 16 bit depth: ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p16' -vf 'freezedetect' -f null -benchmark - Test results on Snapdragon 845 Before: frame= 250 fps= 19 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.756x bench: utime=8.700s stime=4.410s rtime=13.226s After: frame= 250 fps= 27 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=1.07x bench: utime=4.920s stime=4.350s rtime=9.356s Test results on HiSilicon Kirin 970: Before: frame= 250 fps=4.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A 
speed=0.161x
bench: utime=48.868s stime=13.124s rtime=62.110s
After:
frame= 250 fps=5.1 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.205x
bench: utime=35.600s stime=13.036s rtime=48.708s
---
 libavfilter/aarch64/Makefile | 2 +
 libavfilter/aarch64/scene_sad_init.c | 37 +++++++
 libavfilter/aarch64/scene_sad_neon.S | 149 +++++++++++++++++++++++++++
 libavfilter/scene_sad.c | 2 +
 libavfilter/scene_sad.h | 2 +
 5 files changed, 192 insertions(+)
 create mode 100644 libavfilter/aarch64/scene_sad_init.c
 create mode 100644 libavfilter/aarch64/scene_sad_neon.S

diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index 6c727f9859..3a458f511f 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,7 +1,9 @@
 OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_afir_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_anlmdn_init.o
+OBJS-$(CONFIG_SCENE_SAD) += aarch64/scene_sad_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o
 
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_afir_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_anlmdn_neon.o
+NEON-OBJS-$(CONFIG_SCENE_SAD) += aarch64/scene_sad_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/scene_sad_init.c b/libavfilter/aarch64/scene_sad_init.c
new file mode 100644
index 0000000000..8de769ac10
--- /dev/null
+++ b/libavfilter/aarch64/scene_sad_init.c
@@ -0,0 +1,37 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/cpu.h" +#include "libavfilter/scene_sad.h" + +void ff_scene_sad_neon(SCENE_SAD_PARAMS); + +void ff_scene_sad16_neon(SCENE_SAD_PARAMS); + +ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth) +{ + int cpu_flags = av_get_cpu_flags(); + if (have_neon(cpu_flags)) { + if (depth == 8) + return ff_scene_sad_neon; + if (depth == 16) + return ff_scene_sad16_neon; + } + + return NULL; +} diff --git a/libavfilter/aarch64/scene_sad_neon.S b/libavfilter/aarch64/scene_sad_neon.S new file mode 100644 index 0000000000..5b3b027a53 --- /dev/null +++ b/libavfilter/aarch64/scene_sad_neon.S @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020 Zhao Zhili + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// Sum of absolute differences between two strided sample buffers.
+// void ff_scene_sadx_neon(const uint8_t *src1, ptrdiff_t stride1,
+//                         const uint8_t *src2, ptrdiff_t stride2,
+//                         ptrdiff_t width, ptrdiff_t height,
+//                         uint64_t *sum)
+.macro scene_sad_neon, depth=8
+        // Arguments:
+        // x0/x1: src1 and stride1 (stride1 is added to src1 after each row)
+        // x2/x3: src2 and stride2 (stride2 is added to src2 after each row)
+        // x4: width in samples (column index is scaled by 2 when depth == 16)
+        // x5: height in rows
+        // x6: sum, receives the 64-bit total
+        // NOTE(review): both loops check their exit condition with b.ne after the
+        // first pass, so width >= 1 and height >= 1 are presumably guaranteed by callers.
+        // Scratch:
+        // x7: samples per vector step, x9: width rounded down to a multiple of x7
+        // x8: row index, x11: column index
+        // x10: running sum (kept in d0 while the vector loop runs)
+        // w12/w13: src1[x]/src2[x] in the scalar loop
+        // x14/x15: load addresses in the vector loop
+
+        mov             x8, xzr                         // row index = 0
+        mov             x10, xzr                        // running sum = 0
+
+.if \depth == 8
+        mov             x7, #64                         // 64 one-byte samples per vector step
+        and             x9, x4, #0xFFFFFFFFFFFFFFC0     // width rounded down to a multiple of 64
+.endif
+
+.if \depth == 16
+        mov             x7, #32                         // 32 two-byte samples per vector step
+        and             x9, x4, #0xFFFFFFFFFFFFFFE0     // width rounded down to a multiple of 32
+.endif
+
+1:      cmp             x4, x7                          // row start: is width at least one vector step?
+        mov             x11, xzr                        // column index = 0
+        b.lt            3f                              // no: the whole row goes through the scalar loop
+
+        mov             v0.d[0], x10                    // hand the running sum to the vector side
+
+        // vector loop: each pass handles 64 bytes of both sources
+2:
+.if \depth == 8
+        add             x14, x0, x11                    // &src1[x]
+        add             x15, x2, x11                    // &src2[x]
+.endif
+
+.if \depth == 16
+        add             x14, x0, x11, lsl #1            // &src1[x], two bytes per sample
+        add             x15, x2, x11, lsl #1            // &src2[x], two bytes per sample
+.endif
+        ld1             {v16.4S, v17.4S, v18.4S, v19.4S}, [x14]
+        ld1             {v20.4S, v21.4S, v22.4S, v23.4S}, [x15]
+        add             x11, x11, x7                    // x += step
+        cmp             x9, x11                         // flags are consumed by "b.ne 2b" below
+
+.if \depth == 8
+        uabd            v16.16B, v16.16B, v20.16B       // per-byte |src1 - src2|
+        uabd            v17.16B, v17.16B, v21.16B
+        uabd            v18.16B, v18.16B, v22.16B
+        uabd            v19.16B, v19.16B, v23.16B
+        uaddlv          h16, v16.16B                    // add up the 16 byte lanes of each register
+        uaddlv          h17, v17.16B
+        uaddlv          h18, v18.16B
+        uaddlv          h19, v19.16B
+.endif
+
+.if \depth == 16
+        uabd            v16.8H, v16.8H, v20.8H          // per-halfword |src1 - src2|
+        uabd            v17.8H, v17.8H, v21.8H
+        uabd            v18.8H, v18.8H, v22.8H
+        uabd            v19.8H, v19.8H, v23.8H
+        uaddlv          s16, v16.8H                     // add up the 8 halfword lanes of each register
+        uaddlv          s17, v17.8H
+        uaddlv          s18, v18.8H
+        uaddlv          s19, v19.8H
+.endif
+
+        add             d16, d16, d17                   // combine the four partial sums ...
+        add             d18, d18, d19
+        add             d0, d0, d16                     // ... and fold them into the running sum
+        add             d0, d0, d18
+
+        b.ne            2b                              // x != x9: another full vector step remains
+
+        cmp             x9, x4
+        fmov            x10, d0                         // running sum back to the general register
+        b.eq            4f                              // width is a multiple of the step: no tail
+
+        // scalar loop: one sample per pass, from column x11 up to width
+3:
+.if \depth == 8
+        ldrb            w12, [x0, x11]
+        ldrb            w13, [x2, x11]
+.endif
+
+.if \depth == 16
+        ldrh            w12, [x0, x11, lsl #1]
+        ldrh            w13, [x2, x11, lsl #1]
+.endif
+        add             x11, x11, #1                    // x++
+        subs            w12, w12, w13                   // src1[x] - src2[x], sets N when negative
+        cneg            w12, w12, mi                    // negate if negative -> absolute difference
+        add             x10, x10, x12                   // 32-bit writes above cleared the top half of x12
+        cmp             x11, x4
+        b.ne            3b                              // until x == width
+
+        // next row
+4:
+        add             x8, x8, #1                      // row++
+        add             x0, x0, x1                      // src1 += stride1
+        cmp             x8, x5
+        add             x2, x2, x3                      // src2 += stride2
+        b.ne            1b                              // until row == height
+
+5:
+        str             x10, [x6]                       // *sum = total
+        ret
+.endm
+
+function ff_scene_sad_neon, export=1
+        scene_sad_neon depth=8
+endfunc
+
+function ff_scene_sad16_neon, export=1
+        scene_sad_neon depth=16
+endfunc
diff --git a/libavfilter/scene_sad.c b/libavfilter/scene_sad.c
index 73d3eacbfa..ee0c71f659 100644
--- a/libavfilter/scene_sad.c
+++ b/libavfilter/scene_sad.c
@@ -61,6 +61,8 @@ ff_scene_sad_fn ff_scene_sad_get_fn(int depth)
     ff_scene_sad_fn sad = NULL;
     if (ARCH_X86)
         sad = ff_scene_sad_get_fn_x86(depth);
+    if (ARCH_AARCH64)
+        sad = ff_scene_sad_get_fn_aarch64(depth);
     if (!sad) {
         if (depth == 8)
             sad = ff_scene_sad_c;
diff --git a/libavfilter/scene_sad.h b/libavfilter/scene_sad.h
index 173a051f2b..c868200dc4 100644
--- a/libavfilter/scene_sad.h
+++ b/libavfilter/scene_sad.h
@@ -37,6 +37,8 @@ void ff_scene_sad_c(SCENE_SAD_PARAMS);
 
 void ff_scene_sad16_c(SCENE_SAD_PARAMS);
 
+ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth);
+
 ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth);
 
 ff_scene_sad_fn ff_scene_sad_get_fn(int depth);