From patchwork Wed Mar 8 10:01:06 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 2814 Delivered-To: ffmpegpatchwork@gmail.com Received: by 10.103.50.79 with SMTP id y76csp953421vsy; Wed, 8 Mar 2017 02:03:36 -0800 (PST) X-Received: by 10.28.133.203 with SMTP id h194mr4770792wmd.122.1488967416690; Wed, 08 Mar 2017 02:03:36 -0800 (PST) Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id w39si3684503wrc.140.2017.03.08.02.03.36; Wed, 08 Mar 2017 02:03:36 -0800 (PST) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; dkim=neutral (body hash did not verify) header.i=@martin-st.20150623.gappssmtp.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id E6C186882EF; Wed, 8 Mar 2017 12:01:26 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-lf0-f67.google.com (mail-lf0-f67.google.com [209.85.215.67]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id A0DAF6882AE for ; Wed, 8 Mar 2017 12:01:24 +0200 (EET) Received: by mail-lf0-f67.google.com with SMTP id g70so1985396lfh.3 for ; Wed, 08 Mar 2017 02:01:38 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=martin-st.20150623.gappssmtp.com; s=20150623; h=from:to:subject:date:message-id:in-reply-to:references; bh=tn+ExYuuSXjKUk4WqKgMeenWQi/2ukBCefEhMhgNE1Q=; b=p1HNMyh/7qkzNlBnZlAuc6rmoA6Hyy3SAHpnrl/nAk/0wNEoxfY93kcpiHn7exl6YC nxyetLBxEdgM9MdFSkuvOuw+tvzgUOwhIJJRptcBijH8UkEN2z3AGJcHV/Z42BD7UVZ4 
tcyqK+F1iJch9bzxib42wXr6Xx7AznGbU8cBNN2xouQaNsV2Wi32b7EJKwPOz3BBvQ0p aCsWek4DwCObr5hLmcwgEshX18dMDxRoHlYADHVJ8TomjBfUGQ/aSxzO/QcMhLY6lVr2 ZEeNAkXXezBDWuXTIgGN2hz7UbmC+DFqdxNuYbAOrl4Bx64yHPmHuyEQ++BfnIS1yUsw 9KkQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references; bh=tn+ExYuuSXjKUk4WqKgMeenWQi/2ukBCefEhMhgNE1Q=; b=d5NsyluUH7goMshXo85JyFtAxoG0A2ueLw3z9bO1WMsmdbFuh+iy0rUQm8iPhlQ3U9 rd+aAp6l3ShSriaTWKDqk+RxxDWiMakgXEoJPdqRjd8NaGs8kR7Mi1T1pow2mgaV5Io4 rV1Gzwsqcf7YXu/ipEF1caerqHulnsjqWtAybgERrtK1Obw0iMDbDvVzeusZXJiHbJtd zPKLn0uk7slnpJOyaxWiAoh8a8f364hLqZ7i/LVxb6NUx5Hg4jAMhFI/SUGSeW3Oz2Ql tScIvcJHPWBVCOydkXRnzpec4i6ch4KArDA9Yz68bF3wNTk0eVdd0QzX02CtX3napt8d xVWQ== X-Gm-Message-State: AMke39lIJamTJh2RdxXdcDKPD7txyLqKzwYGjsf9DyR7DR98zooRqGo3ZefRPhaPtT5RHg== X-Received: by 10.25.227.5 with SMTP id a5mr1508780lfh.118.1488967297257; Wed, 08 Mar 2017 02:01:37 -0800 (PST) Received: from localhost.localdomain ([2001:470:28:852:7d47:68e:13e8:4933]) by smtp.gmail.com with ESMTPSA id m127sm513064lfg.58.2017.03.08.02.01.36 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128); Wed, 08 Mar 2017 02:01:36 -0800 (PST) From: =?UTF-8?q?Martin=20Storsj=C3=B6?= To: ffmpeg-devel@ffmpeg.org Date: Wed, 8 Mar 2017 12:01:06 +0200 Message-Id: <1488967274-8143-26-git-send-email-martin@martin.st> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1488967274-8143-1-git-send-email-martin@martin.st> References: <1488967274-8143-1-git-send-email-martin@martin.st> Subject: [FFmpeg-devel] [PATCH 26/34] arm/aarch64: vp9lpf: Keep the comparison to E within 8 bit X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: 
"ffmpeg-devel" The theoretical maximum value of E is 193, so we can just saturate the addition to 255: a sum that saturates at 255 still compares greater than any valid E, so the result of the comparison is unchanged. Before: Cortex A7 A8 A9 A53 A53/AArch64 vp9_loop_filter_v_4_8_neon: 143.0 127.7 114.8 88.0 87.7 vp9_loop_filter_v_8_8_neon: 241.0 197.2 173.7 140.0 136.7 vp9_loop_filter_v_16_8_neon: 497.0 419.5 379.7 293.0 275.7 vp9_loop_filter_v_16_16_neon: 965.2 818.7 731.4 579.0 452.0 After: vp9_loop_filter_v_4_8_neon: 136.0 125.7 112.6 84.0 83.0 vp9_loop_filter_v_8_8_neon: 234.0 195.5 171.5 136.0 133.7 vp9_loop_filter_v_16_8_neon: 490.0 417.5 377.7 289.0 271.0 vp9_loop_filter_v_16_16_neon: 951.2 814.7 732.3 571.0 446.7 This is cherry-picked from libav commit c582cb8537367721bb399a5d01b652c20142b756. --- libavcodec/aarch64/vp9lpf_neon.S | 40 +++++++++------------------------------- libavcodec/arm/vp9lpf_neon.S | 11 +++++------ 2 files changed, 14 insertions(+), 37 deletions(-) diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index ebfd9be..a9eea7f 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -51,13 +51,6 @@ // see the arm version instead. -.macro uabdl_sz dst1, dst2, in1, in2, sz - uabdl \dst1, \in1\().8b, \in2\().8b -.ifc \sz, .16b - uabdl2 \dst2, \in1\().16b, \in2\().16b -.endif -.endm - .macro add_sz dst1, dst2, in1, in2, in3, in4, sz add \dst1, \in1, \in3 .ifc \sz, .16b @@ -86,20 +79,6 @@ .endif .endm -.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz - cmhs \dst1, \in1, \in3 -.ifc \sz, .16b - cmhs \dst2, \in2, \in4 -.endif -.endm - -.macro xtn_sz dst, in1, in2, sz - xtn \dst\().8b, \in1 -.ifc \sz, .16b - xtn2 \dst\().16b, \in2 -.endif -.endm - .macro usubl_sz dst1, dst2, in1, in2, sz usubl \dst1, \in1\().8b, \in2\().8b .ifc \sz, .16b @@ -179,20 +158,20 @@ // tmpq2 == tmp3 + tmp4, etc. 
.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8 .if \mix == 0 - dup v0.8h, w2 // E - dup v1.8h, w2 // E + dup v0\sz, w2 // E dup v2\sz, w3 // I dup v3\sz, w4 // H .else - dup v0.8h, w2 // E + dup v0.8b, w2 // E dup v2.8b, w3 // I dup v3.8b, w4 // H + lsr w5, w2, #8 lsr w6, w3, #8 lsr w7, w4, #8 - ushr v1.8h, v0.8h, #8 // E + dup v1.8b, w5 // E dup v4.8b, w6 // I - bic v0.8h, #255, lsl 8 // E dup v5.8b, w7 // H + trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d .endif @@ -206,16 +185,15 @@ umax v4\sz, v4\sz, v5\sz umax v5\sz, v6\sz, v7\sz umax \tmp1\sz, \tmp1\sz, \tmp2\sz - uabdl_sz v6.8h, v7.8h, v23, v24, \sz // abs(p0 - q0) + uabd v6\sz, v23\sz, v24\sz // abs(p0 - q0) umax v4\sz, v4\sz, v5\sz - add_sz v6.8h, v7.8h, v6.8h, v7.8h, v6.8h, v7.8h, \sz // abs(p0 - q0) * 2 + uqadd v6\sz, v6\sz, v6\sz // abs(p0 - q0) * 2 uabd v5\sz, v22\sz, v25\sz // abs(p1 - q1) umax v4\sz, v4\sz, \tmp1\sz // max(abs(p3 - p2), ..., abs(q2 - q3)) ushr v5\sz, v5\sz, #1 cmhs v4\sz, v2\sz, v4\sz // max(abs()) <= I - uaddw_sz v6.8h, v7.8h, v6.8h, v7.8h, v5, \sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 - cmhs_sz v6.8h, v7.8h, v0.8h, v1.8h, v6.8h, v7.8h, \sz - xtn_sz v5, v6.8h, v7.8h, \sz + uqadd v6\sz, v6\sz, v5\sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + cmhs v5\sz, v0\sz, v6\sz and v4\sz, v4\sz, v5\sz // fm // If no pixels need filtering, just exit as soon as possible diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index b90c536..2d91092 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -51,7 +51,7 @@ @ and d28-d31 as temp registers, or d8-d15. 
@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4 .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4 - vdup.u16 q0, r2 @ E + vdup.u8 d0, r2 @ E vdup.u8 d2, r3 @ I ldr r3, [sp] @@ -64,16 +64,15 @@ vmax.u8 d4, d4, d5 vmax.u8 d5, d6, d7 vmax.u8 \tmp1, \tmp1, \tmp2 - vabdl.u8 q3, d23, d24 @ abs(p0 - q0) + vabd.u8 d6, d23, d24 @ abs(p0 - q0) vmax.u8 d4, d4, d5 - vadd.u16 q3, q3, q3 @ abs(p0 - q0) * 2 + vqadd.u8 d6, d6, d6 @ abs(p0 - q0) * 2 vabd.u8 d5, d22, d25 @ abs(p1 - q1) vmax.u8 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3)) vshr.u8 d5, d5, #1 vcle.u8 d4, d4, d2 @ max(abs()) <= I - vaddw.u8 q3, q3, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 - vcle.u16 q3, q3, q0 - vmovn.u16 d5, q3 + vqadd.u8 d6, d6, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + vcle.u8 d5, d6, d0 vand d4, d4, d5 @ fm vdup.u8 d3, r3 @ H