From patchwork Thu Mar 16 22:10:08 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 2968 Delivered-To: ffmpegpatchwork@gmail.com Received: by 10.103.50.79 with SMTP id y76csp3516vsy; Thu, 16 Mar 2017 15:17:14 -0700 (PDT) X-Received: by 10.28.30.79 with SMTP id e76mr50527wme.96.1489702634307; Thu, 16 Mar 2017 15:17:14 -0700 (PDT) Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id h9si8375328wrc.243.2017.03.16.15.17.07; Thu, 16 Mar 2017 15:17:14 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; dkim=neutral (body hash did not verify) header.i=@martin-st.20150623.gappssmtp.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 78E6A68831B; Fri, 17 Mar 2017 00:16:48 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-lf0-f50.google.com (mail-lf0-f50.google.com [209.85.215.50]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 14076688266 for ; Fri, 17 Mar 2017 00:16:42 +0200 (EET) Received: by mail-lf0-f50.google.com with SMTP id a6so26238129lfa.0 for ; Thu, 16 Mar 2017 15:16:59 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=martin-st.20150623.gappssmtp.com; s=20150623; h=from:to:subject:date:message-id:in-reply-to:references; bh=9jspCzV/wP9mLUOeZD0urvWSU35+ssT23pQllAqI6cM=; b=s/62/SUmu6Rgl1eQtUUvr0LCoJVty9SqR98LahE3D2NCDzmyLQ+rDrcozA3jFqT3Bs LT7TDdcgw9YIEofzLGIjcmWW6sChIdTN3zvZQ3EYcD1v+RmWXr5X2px/hDcrE4pT3dxP mA5NAsE2bOeA8+nQZXbmhgPZvO1yKY0SeurVOthjZkETLoqfbMqjdkvAIP3vnJLfHIBD TtAoeHLDUeicMxxcNP+rAA8yvbzxUTroODABbOyuw8W2r8d56Xe3R2rTMCZsOy7Bs2V6 2PraiXVUf98PJq5fNd5uwaE9KJMs/DHh3z5m6pK4IJ2MMQgSDAMW6V+vCdfFh+Oos6s8 YL8Q== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references; bh=9jspCzV/wP9mLUOeZD0urvWSU35+ssT23pQllAqI6cM=; b=AHRMhfygeOznsrfinYq4ErJ0nnFmK6Kuv59LVYofYggOofrS0ntbQ9Vg34XpEnlh1v 7DhwrioVcaQdfMrn1JAEdOuzX9ZcldZippOGU2I0ydZKQGPW5EiJKxbNlKv+CTbXsnp5 gxTtDJX8oBFsPfCiefpBrMQYhaonu6Nd0MkM0vY04gGIdZLcyWg+YedNaSlNQHNF0WOr nFGqhWVLDN+cLI/+G7F0Dbrpo7apwDZv8gATWoZimlWJJIzpSVO1uqztIrEPxK/TwfMb EHn2jeUwxlu3Idmi0wAhB9FPskprL7Drwmu5s2fEIEEPE/uroaJX0Qjk8ohuJk1Deg+P muIg== X-Gm-Message-State: AFeK/H2Tz97jBJ8It96u+40gxeXB6SI1G0R7OZMuhurRLpsyNr/YzzKn9ahD8OFR8Bwx7g== X-Received: by 10.46.13.9 with SMTP id 9mr3724473ljn.40.1489702223667; Thu, 16 Mar 2017 15:10:23 -0700 (PDT) Received: from localhost.localdomain ([2001:470:28:852:10ad:e858:1f3b:5c2c]) by smtp.gmail.com with ESMTPSA id g3sm1124718lfe.34.2017.03.16.15.10.22 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128); Thu, 16 Mar 2017 15:10:23 -0700 (PDT) From: =?UTF-8?q?Martin=20Storsj=C3=B6?= To: ffmpeg-devel@ffmpeg.org Date: Fri, 17 Mar 2017 00:10:08 +0200 Message-Id: <1489702219-12643-3-git-send-email-martin@martin.st> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1489702219-12643-1-git-send-email-martin@martin.st> References: <1489702219-12643-1-git-send-email-martin@martin.st> Subject: [FFmpeg-devel] [PATCH 03/14] arm/aarch64: vp9: Fix vertical alignment X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Align the second/third operands as they usually are. Due to the wildly varying sizes of the written out operands in aarch64 assembly, the column alignment is usually not as clear as in arm assembly. This is cherrypicked from libav commit 7995ebfad12002033c73feed422a1cfc62081e8f. --- libavcodec/aarch64/vp9itxfm_neon.S | 36 ++++++++++++++++++------------------ libavcodec/arm/vp9itxfm_neon.S | 14 +++++++------- libavcodec/arm/vp9lpf_neon.S | 2 +- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 3e5da08..b12890f 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -380,7 +380,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 .ifc \txfm1\()_\txfm2,idct_idct movrel x4, idct_coeffs .else - movrel x4, iadst8_coeffs + movrel x4, iadst8_coeffs ld1 {v1.8h}, [x4], #16 .endif ld1 {v0.8h}, [x4] @@ -480,23 +480,23 @@ itxfm_func8x8 iadst, iadst function idct16x16_dc_add_neon - movrel x4, idct_coeffs + movrel x4, idct_coeffs ld1 {v0.4h}, [x4] - movi v1.4h, #0 + movi v1.4h, #0 ld1 {v2.h}[0], [x2] - smull v2.4s, v2.4h, v0.h[0] - rshrn v2.4h, v2.4s, #14 - smull v2.4s, v2.4h, v0.h[0] - rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 dup v2.8h, v2.h[0] st1 {v1.h}[0], [x2] - srshr v2.8h, v2.8h, #6 + srshr v2.8h, v2.8h, #6 - mov x3, x0 - mov x4, #16 + mov x3, x0 + mov x4, #16 1: // Loop to add the constant from v2 into all 16x16 outputs subs x4, x4, #2 @@ -869,7 +869,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .ifc \txfm1,idct ld1 {v0.8h,v1.8h}, [x10] .endif - mov x9, #32 + mov x9, #32 .ifc \txfm1\()_\txfm2,idct_idct cmp w3, #10 @@ -1046,10 +1046,10 @@ idct16_partial quarter idct16_partial half function idct32x32_dc_add_neon - movrel x4, idct_coeffs + movrel x4, idct_coeffs ld1 {v0.4h}, [x4] - movi v1.4h, #0 + movi v1.4h, #0 ld1 {v2.h}[0], [x2] smull v2.4s, v2.4h, v0.h[0] @@ -1059,10 +1059,10 @@ function idct32x32_dc_add_neon dup v2.8h, v2.h[0] st1 {v1.h}[0], [x2] - srshr v0.8h, v2.8h, #6 + srshr v0.8h, v2.8h, #6 - mov x3, x0 - mov x4, #32 + mov x3, x0 + mov x4, #32 1: // Loop to add the constant v0 into all 32x32 outputs subs x4, x4, #2 @@ -1230,7 +1230,7 @@ endfunc // x9 = double input stride function idct32_1d_8x32_pass1\suffix\()_neon mov x14, x30 - movi v2.8h, #0 + movi v2.8h, #0 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) .ifb \suffix @@ -1295,7 +1295,7 @@ function idct32_1d_8x32_pass1\suffix\()_neon .endif add x2, x2, #64 - movi v2.8h, #0 + movi v2.8h, #0 // v16 = IN(1), v17 = IN(3) ... v31 = IN(31) .ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 6d4d765..6c09922 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -530,7 +530,7 @@ function idct16x16_dc_add_neon movrel r12, idct_coeffs vld1.16 {d0}, [r12,:64] - vmov.i16 q2, #0 + vmov.i16 q2, #0 vld1.16 {d16[]}, [r2,:16] vmull.s16 q8, d16, d0[0] @@ -793,7 +793,7 @@ function \txfm\()16_1d_4x16_pass1_neon push {lr} mov r12, #32 - vmov.s16 q2, #0 + vmov.s16 q2, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64] vst1.16 {d4}, [r2,:64], r12 @@ -1142,7 +1142,7 @@ function idct32x32_dc_add_neon movrel r12, idct_coeffs vld1.16 {d0}, [r12,:64] - vmov.i16 q2, #0 + vmov.i16 q2, #0 vld1.16 {d16[]}, [r2,:16] vmull.s16 q8, d16, d0[0] @@ -1330,7 +1330,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon @ Double stride of the input, since we only read every other line mov r12, #128 - vmov.s16 d4, #0 + vmov.s16 d4, #0 @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30) .ifb \suffix @@ -1394,7 +1394,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon .endif add r2, r2, #64 - vmov.s16 d8, #0 + vmov.s16 d8, #0 @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31) .ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1533,9 +1533,9 @@ function idct32_1d_4x32_pass2\suffix\()_neon .endif vld1.32 {d12[]}, [r0,:32], r1 vld1.32 {d12[1]}, [r0,:32], r1 - vrshr.s16 q4, q4, #6 + vrshr.s16 q4, q4, #6 vld1.32 {d13[]}, [r0,:32], r1 - vrshr.s16 q5, q5, #6 + vrshr.s16 q5, q5, #6 vld1.32 {d13[1]}, [r0,:32], r1 sub r0, r0, r1, lsl #2 vaddw.u8 q4, q4, d12 diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index 8d44d58..4b36080 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -828,7 +828,7 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1 endfunc function vp9_loop_filter_h_16_neon - sub r12, r0, #8 + sub r12, r0, #8 vld1.8 {d16}, [r12,:64], r1 vld1.8 {d24}, [r0, :64], r1 vld1.8 {d17}, [r12,:64], r1