From patchwork Wed Mar 8 10:00:56 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 2805 Delivered-To: ffmpegpatchwork@gmail.com Received: by 10.103.50.79 with SMTP id y76csp953152vsy; Wed, 8 Mar 2017 02:02:51 -0800 (PST) X-Received: by 10.28.11.205 with SMTP id 196mr4458375wml.31.1488967370970; Wed, 08 Mar 2017 02:02:50 -0800 (PST) Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id d12si3671643wrb.170.2017.03.08.02.02.50; Wed, 08 Mar 2017 02:02:50 -0800 (PST) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; dkim=neutral (body hash did not verify) header.i=@martin-st.20150623.gappssmtp.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 149546882C8; Wed, 8 Mar 2017 12:01:20 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-lf0-f52.google.com (mail-lf0-f52.google.com [209.85.215.52]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id E5D46688283 for ; Wed, 8 Mar 2017 12:01:16 +0200 (EET) Received: by mail-lf0-f52.google.com with SMTP id k202so12373174lfe.1 for ; Wed, 08 Mar 2017 02:01:30 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=martin-st.20150623.gappssmtp.com; s=20150623; h=from:to:subject:date:message-id:in-reply-to:references; bh=vzOT21ZwlwhXBhFyobdXAW9XgjLHdIwC0m3cLMQAfuA=; b=fz243Ix90NqfcxpFl+Ro8Z5qndfxMVOai6XB4uMDYuJBoUEn3sH5Q3xx0kdFiPuJPg L0M9Fa+2jyRISuC60amVqn7tnSGigRpCycDq61hoOp7tLiyKvBGvRp7BL/cv7WLtA1Lv 
Euk4PUdEiyjKAnGWc9gH1ot1hqrA3foDQ+uEScCVW0cfjIQK+gMndN898bkdTWrPK9pM n0gGGOeMQ0pHR965ZOBYEaDajlJIS7MSpZAQ14JdGEhiTfv0PbB6hP+MgIJ3pHdVWeg0 lof+QDkYAYgkwfoR2goPIIFC/azVyERqqSx+c4hU9KJeIjxY/5KLy5ZhdwkuZmhVbMGL H9PA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references; bh=vzOT21ZwlwhXBhFyobdXAW9XgjLHdIwC0m3cLMQAfuA=; b=NtsejoaG48NGtvH82mlpsnoxdXomCUt7CRFTnwfvQrLnqlK5mub10i+T5slm+7WQXe TjOsq+HTwVjutoQyVcSpdo8CVYhBsjyqGQh1Wv4P4w/zbBjR/YHUUecoiBlbgiRBpwi9 FJCVgkjCvCe0hKRbo+orzfZ/QmipCMlDe9DN5QNXUqwwy4sJN+HZJM7ImUzO+56qdt4M C4K8+GAlJ4D/tb7iqLlPRpZdh0ybVPlh6IK5GO+hn9II6pv8AQ22dkUenjszcVuSwvD/ AiK2I7zNsJMiJQdHiOBaCa8h4XiTG6UlVBoEzeZj24/p806h+A3U2rly4CSEYyzjgurX xeNg== X-Gm-Message-State: AMke39kXdJ4Jhqz1oQLuANZjNB/oTTqgyE43ZPZAlrBDNHm+owsh1T5wAVqIVX1GsRmI3Q== X-Received: by 10.46.82.197 with SMTP id n66mr1877407lje.110.1488967289305; Wed, 08 Mar 2017 02:01:29 -0800 (PST) Received: from localhost.localdomain ([2001:470:28:852:7d47:68e:13e8:4933]) by smtp.gmail.com with ESMTPSA id m127sm513064lfg.58.2017.03.08.02.01.28 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128); Wed, 08 Mar 2017 02:01:28 -0800 (PST) From: =?UTF-8?q?Martin=20Storsj=C3=B6?= To: ffmpeg-devel@ffmpeg.org Date: Wed, 8 Mar 2017 12:00:56 +0200 Message-Id: <1488967274-8143-16-git-send-email-martin@martin.st> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1488967274-8143-1-git-send-email-martin@martin.st> References: <1488967274-8143-1-git-send-email-martin@martin.st> Subject: [FFmpeg-devel] [PATCH 16/34] arm: vp9mc: Calculate less unused data in the 4 pixel wide horizontal filter X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches MIME-Version: 1.0 Errors-To: 
ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Before: Cortex A7 A8 A9 A53 vp9_put_8tap_smooth_4h_neon: 378.1 273.2 340.7 229.5 After: vp9_put_8tap_smooth_4h_neon: 352.1 222.2 290.5 229.5 This is cherry-picked from libav commit fea92a4b57d1c328b1de226a5f213a629ee63754. --- libavcodec/arm/vp9mc_neon.S | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S index 83235ff..bd8cda7 100644 --- a/libavcodec/arm/vp9mc_neon.S +++ b/libavcodec/arm/vp9mc_neon.S @@ -209,7 +209,7 @@ endfunc @ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6 @ for size >= 16), and multiply-accumulate into dst1 and dst3 (or @ dst1-dst2 and dst3-dst4 for size >= 16) -.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size +.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size vext.8 q14, \src1, \src2, #(2*\offset) vext.8 q15, \src4, \src5, #(2*\offset) .if \size >= 16 @@ -219,14 +219,17 @@ endfunc vext.8 q6, \src5, \src6, #(2*\offset) vmla_lane \dst2, q5, \offset vmla_lane \dst4, q6, \offset -.else +.elseif \size == 8 vmla_lane \dst1, q14, \offset vmla_lane \dst3, q15, \offset +.else + vmla_lane \dst1d, d28, \offset + vmla_lane \dst3d, d30, \offset .endif .endm @ The same as above, but don't accumulate straight into the @ destination, but use a temp register and accumulate with saturation. 
-.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size +.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size vext.8 q14, \src1, \src2, #(2*\offset) vext.8 q15, \src4, \src5, #(2*\offset) .if \size >= 16 @@ -236,16 +239,24 @@ endfunc vext.8 q6, \src5, \src6, #(2*\offset) vmul_lane q5, q5, \offset vmul_lane q6, q6, \offset -.else +.elseif \size == 8 vmul_lane q14, q14, \offset vmul_lane q15, q15, \offset +.else + vmul_lane d28, d28, \offset + vmul_lane d30, d30, \offset .endif +.if \size == 4 + vqadd.s16 \dst1d, \dst1d, d28 + vqadd.s16 \dst3d, \dst3d, d30 +.else vqadd.s16 \dst1, \dst1, q14 vqadd.s16 \dst3, \dst3, q15 .if \size >= 16 vqadd.s16 \dst2, \dst2, q5 vqadd.s16 \dst4, \dst4, q6 .endif +.endif .endm @@ -308,13 +319,13 @@ function \type\()_8tap_\size\()h_\idx1\idx2 vmul.s16 q2, q9, d0[0] vmul.s16 q4, q12, d0[0] .endif - extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 1, \size - extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 2, \size - extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, \idx1, \size - extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 5, \size - extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 6, \size - extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 7, \size - extmulqadd q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, \idx2, \size + extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 1, \size + extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 2, \size + extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx1, \size + extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 5, \size + extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 6, \size + extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 7, \size + extmulqadd q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx2, \size @ Round, shift and saturate vqrshrun.s16 d2, q1, #7