From patchwork Wed Mar 8 10:00:42 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 2801 Delivered-To: ffmpegpatchwork@gmail.com Received: by 10.103.50.79 with SMTP id y76csp952657vsy; Wed, 8 Mar 2017 02:01:40 -0800 (PST) X-Received: by 10.223.174.131 with SMTP id y3mr4478818wrc.40.1488967300208; Wed, 08 Mar 2017 02:01:40 -0800 (PST) Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id q4si3642229wrc.328.2017.03.08.02.01.39; Wed, 08 Mar 2017 02:01:40 -0800 (PST) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; dkim=neutral (body hash did not verify) header.i=@martin-st.20150623.gappssmtp.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 2C15068827A; Wed, 8 Mar 2017 12:01:12 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-lf0-f45.google.com (mail-lf0-f45.google.com [209.85.215.45]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 75676680296 for ; Wed, 8 Mar 2017 12:01:05 +0200 (EET) Received: by mail-lf0-f45.google.com with SMTP id y193so12354978lfd.3 for ; Wed, 08 Mar 2017 02:01:19 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=martin-st.20150623.gappssmtp.com; s=20150623; h=from:to:subject:date:message-id:in-reply-to:references; bh=+4bB3+u5e2UbW73JmEBwi/vLP30x/8QDHHHg6xbByBo=; b=HocB/iyWZVKiqN1YJ8i/LC1KO3zQmxrFfXwnzSzwk4k/kNMjOOoLxibqL3xxmYAxPA wduNAwxnrZOmA4nh1muIg9XTsh/uLOdcbrD23mufP/c0hzvMmdXP0MsW5Q9B7v3DM0Wk Xw/dHLDOk0oCsu5/NgJIpS3J4IqozmSHag/Xiqotxpc/QeeiJhRhnSq8X0fYqzlvZYZK KIecK/eTDTmmYeLK6X19tKv1UqGgfkOYiRV8N0CLqz70spHDt1FsqDh6CW46mw35Y06j 5BBPxKPdrhhHWtsKl0RUPilo/hU8Sveq6YITSZFIRRxU9JdsmZfHoAcQCWmVEZTwqL5w tVKw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references; bh=+4bB3+u5e2UbW73JmEBwi/vLP30x/8QDHHHg6xbByBo=; b=mGBNiTEdnJmIVGMD+0cU7KgC7a59xY2l/Q3xro2HR6VVC0XIcRhFEJSWPqZ2tBHiBq B4eVKUPSdYg6R+/dwt9qJD2Ul4G99cenEVpxwrO9VnqoAzcPDILDKRtktxlvidTIVK7y uKyPbIEvrbLIZEV1nx1aDcxxDXgFTyQUqbmcpVyGuPKMB2eUANueMWHM9LxIF8Qdsm32 f5X0H5z/pwngnS7sgdoLbc/wX1EIJTK7IRWOW+8HU23TdPZV2aNaebO8DW/+DKF8tOh+ 2uu4Zxl+YaWka9jajgUTkDCj0fd63gtJ5EEfXj3fcYW023X4dFyGgfnohxl6b0/5yzJ2 WxsA== X-Gm-Message-State: AMke39lx3uWesYPYgkcWamrWXZjxnNC8/5YcgpGt5M59Nd0i63+GjwmIup271NwFacTjWw== X-Received: by 10.25.216.232 with SMTP id r101mr1306311lfi.28.1488967278113; Wed, 08 Mar 2017 02:01:18 -0800 (PST) Received: from localhost.localdomain ([2001:470:28:852:7d47:68e:13e8:4933]) by smtp.gmail.com with ESMTPSA id m127sm513064lfg.58.2017.03.08.02.01.17 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128); Wed, 08 Mar 2017 02:01:17 -0800 (PST) From: =?UTF-8?q?Martin=20Storsj=C3=B6?= To: ffmpeg-devel@ffmpeg.org Date: Wed, 8 Mar 2017 12:00:42 +0200 Message-Id: <1488967274-8143-2-git-send-email-martin@martin.st> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1488967274-8143-1-git-send-email-martin@martin.st> References: <1488967274-8143-1-git-send-email-martin@martin.st> Subject: [FFmpeg-devel] [PATCH 02/34] aarch64: vp9itxfm: Restructure the idct32 store macros X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" This avoids concatenation, which can't be used if the whole macro is wrapped within another macro. This is also arguably more readable. This is cherrypicked from libav commit 58d87e0f49bcbbc6f426328f53b657bae7430cd2. --- libavcodec/aarch64/vp9itxfm_neon.S | 80 +++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 82f1f41..7427963 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -935,23 +935,23 @@ function idct32_1d_8x32_pass1_neon .macro store_rev a, b // There's no rev128 instruction, but we reverse each 64 bit // half, and then flip them using an ext with 8 bytes offset. - rev64 v1.8h, v\b\().8h - st1 {v\a\().8h}, [x0], #16 - rev64 v0.8h, v\a\().8h + rev64 v1.8h, \b + st1 {\a}, [x0], #16 + rev64 v0.8h, \a ext v1.16b, v1.16b, v1.16b, #8 - st1 {v\b\().8h}, [x0], #16 + st1 {\b}, [x0], #16 ext v0.16b, v0.16b, v0.16b, #8 st1 {v1.8h}, [x0], #16 st1 {v0.8h}, [x0], #16 .endm - store_rev 16, 24 - store_rev 17, 25 - store_rev 18, 26 - store_rev 19, 27 - store_rev 20, 28 - store_rev 21, 29 - store_rev 22, 30 - store_rev 23, 31 + store_rev v16.8h, v24.8h + store_rev v17.8h, v25.8h + store_rev v18.8h, v26.8h + store_rev v19.8h, v27.8h + store_rev v20.8h, v28.8h + store_rev v21.8h, v29.8h + store_rev v22.8h, v30.8h + store_rev v23.8h, v31.8h sub x0, x0, #512 .purgem store_rev @@ -977,14 +977,14 @@ function idct32_1d_8x32_pass1_neon // subtracted from the output. .macro store_rev a, b ld1 {v4.8h}, [x0] - rev64 v1.8h, v\b\().8h - add v4.8h, v4.8h, v\a\().8h - rev64 v0.8h, v\a\().8h + rev64 v1.8h, \b + add v4.8h, v4.8h, \a + rev64 v0.8h, \a st1 {v4.8h}, [x0], #16 ext v1.16b, v1.16b, v1.16b, #8 ld1 {v5.8h}, [x0] ext v0.16b, v0.16b, v0.16b, #8 - add v5.8h, v5.8h, v\b\().8h + add v5.8h, v5.8h, \b st1 {v5.8h}, [x0], #16 ld1 {v6.8h}, [x0] sub v6.8h, v6.8h, v1.8h @@ -994,14 +994,14 @@ function idct32_1d_8x32_pass1_neon st1 {v7.8h}, [x0], #16 .endm - store_rev 31, 23 - store_rev 30, 22 - store_rev 29, 21 - store_rev 28, 20 - store_rev 27, 19 - store_rev 26, 18 - store_rev 25, 17 - store_rev 24, 16 + store_rev v31.8h, v23.8h + store_rev v30.8h, v22.8h + store_rev v29.8h, v21.8h + store_rev v28.8h, v20.8h + store_rev v27.8h, v19.8h + store_rev v26.8h, v18.8h + store_rev v25.8h, v17.8h + store_rev v24.8h, v16.8h .purgem store_rev ret endfunc @@ -1047,21 +1047,21 @@ function idct32_1d_8x32_pass2_neon .if \neg == 0 ld1 {v4.8h}, [x2], x9 ld1 {v5.8h}, [x2], x9 - add v4.8h, v4.8h, v\a\().8h + add v4.8h, v4.8h, \a ld1 {v6.8h}, [x2], x9 - add v5.8h, v5.8h, v\b\().8h + add v5.8h, v5.8h, \b ld1 {v7.8h}, [x2], x9 - add v6.8h, v6.8h, v\c\().8h - add v7.8h, v7.8h, v\d\().8h + add v6.8h, v6.8h, \c + add v7.8h, v7.8h, \d .else ld1 {v4.8h}, [x2], x7 ld1 {v5.8h}, [x2], x7 - sub v4.8h, v4.8h, v\a\().8h + sub v4.8h, v4.8h, \a ld1 {v6.8h}, [x2], x7 - sub v5.8h, v5.8h, v\b\().8h + sub v5.8h, v5.8h, \b ld1 {v7.8h}, [x2], x7 - sub v6.8h, v6.8h, v\c\().8h - sub v7.8h, v7.8h, v\d\().8h + sub v6.8h, v6.8h, \c + sub v7.8h, v7.8h, \d .endif ld1 {v0.8b}, [x0], x1 ld1 {v1.8b}, [x0], x1 @@ -1085,15 +1085,15 @@ function idct32_1d_8x32_pass2_neon st1 {v6.8b}, [x0], x1 st1 {v7.8b}, [x0], x1 .endm - load_acc_store 31, 30, 29, 28 - load_acc_store 27, 26, 25, 24 - load_acc_store 23, 22, 21, 20 - load_acc_store 19, 18, 17, 16 + load_acc_store v31.8h, v30.8h, v29.8h, v28.8h + load_acc_store v27.8h, v26.8h, v25.8h, v24.8h + load_acc_store v23.8h, v22.8h, v21.8h, v20.8h + load_acc_store v19.8h, v18.8h, v17.8h, v16.8h sub x2, x2, x9 - load_acc_store 16, 17, 18, 19, 1 - load_acc_store 20, 21, 22, 23, 1 - load_acc_store 24, 25, 26, 27, 1 - load_acc_store 28, 29, 30, 31, 1 + load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1 + load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1 + load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1 + load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1 .purgem load_acc_store ret endfunc