From patchwork Thu Mar 16 22:10:14 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 2969 Delivered-To: ffmpegpatchwork@gmail.com Received: by 10.103.50.79 with SMTP id y76csp3618vsy; Thu, 16 Mar 2017 15:17:30 -0700 (PDT) X-Received: by 10.28.209.202 with SMTP id i193mr50184wmg.17.1489702650304; Thu, 16 Mar 2017 15:17:30 -0700 (PDT) Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id h28si439293wmi.157.2017.03.16.15.17.28; Thu, 16 Mar 2017 15:17:30 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; dkim=neutral (body hash did not verify) header.i=@martin-st.20150623.gappssmtp.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id DA9E668833D; Fri, 17 Mar 2017 00:17:09 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-lf0-f66.google.com (mail-lf0-f66.google.com [209.85.215.66]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 61094688283 for ; Fri, 17 Mar 2017 00:17:03 +0200 (EET) Received: by mail-lf0-f66.google.com with SMTP id y193so4312010lfd.1 for ; Thu, 16 Mar 2017 15:17:20 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=martin-st.20150623.gappssmtp.com; s=20150623; h=from:to:subject:date:message-id:in-reply-to:references; bh=HRFl7KNAJIB2jEsoAHoZslXkbJBAsoTDfPPF41x2h7U=; b=BWI9C0NH3qkV3GKlpm8J+wxNfnhx2kXvz2P9BZctQqw2UJmT4BmQd4KzIbmxZk5rAx KnQJMEOhEFM6ymPloZQhtOnXqGGK57CcrSvnR4V8V9JwEWKGV6fB52pK0zHoASOpYtfa D9DUyz//F17Q7eF+Ean4qp0VqaAOXcisa2BPI34GVY7CM33PiazknDijy1ZfBikamzHT T0c6ZSdNTW5F6sDiCVNS88SYgA9gfetv/aY6iLkzkYE1gRHSBtrGZm4Qf2QRTVz4AWil ZV19bLS0ERYwpBkas33E3SKrHZ9Sc8IRtbbrYBUFQvaBOteZ+9w4ENrQ+ARwWkEooSqF yahQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references; bh=HRFl7KNAJIB2jEsoAHoZslXkbJBAsoTDfPPF41x2h7U=; b=NRABDakHh1nh8tlwCupC21h+RoaBJ/BQV9ZhYKWrYj+Lq144JcNVQJn/MTsLQJhXb2 kgRSi4sdC2E6XyzGRcVpGkqOTgOQDEe/c7D15KGebEGfMJ2OJooUVxFuEpEsog/unjui Gq2oWib5pYoyLAmrLYZuqDzlYoUhgcv0XwDkiYd+aaLZIagiMfAosNd8FcypSHJ+5eJI t9P53uviPbXgGDhanO4Akg4CWubzDnenoqDMGme/9pr027lN3vFyECOQLCEW7oQFeezI aA0XXqY42wQYnWdQtfmdcEdc0sbx8o0R/kAaSHHc7UC8p2t52fR6xcjQqxi//e7rN+yo JBeQ== X-Gm-Message-State: AFeK/H2q6Gy2sccFqs2/VeJxaUk7iFeBEr3nMFCGhy4ci5SbNjJfzCwbEezlQq+Nx7+8yw== X-Received: by 10.46.69.7 with SMTP id s7mr3868092lja.42.1489702228508; Thu, 16 Mar 2017 15:10:28 -0700 (PDT) Received: from localhost.localdomain ([2001:470:28:852:10ad:e858:1f3b:5c2c]) by smtp.gmail.com with ESMTPSA id g3sm1124718lfe.34.2017.03.16.15.10.27 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128); Thu, 16 Mar 2017 15:10:28 -0700 (PDT) From: =?UTF-8?q?Martin=20Storsj=C3=B6?= To: ffmpeg-devel@ffmpeg.org Date: Fri, 17 Mar 2017 00:10:14 +0200 Message-Id: <1489702219-12643-9-git-send-email-martin@martin.st> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1489702219-12643-1-git-send-email-martin@martin.st> References: <1489702219-12643-1-git-send-email-martin@martin.st> Subject: [FFmpeg-devel] [PATCH 09/14] aarch64: vp9itxfm16: Restructure the idct32 store macros X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" This avoids concatenation, which can't be used if the whole macro is wrapped within another macro. --- libavcodec/aarch64/vp9itxfm_16bpp_neon.S | 90 ++++++++++++++++---------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S index 86ea29e..a97c1b6 100644 --- a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S +++ b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S @@ -1244,27 +1244,27 @@ function idct32_1d_4x32_pass1_neon .macro store_rev a, b, c, d // There's no rev128 instruction, but we reverse each 64 bit // half, and then flip them using an ext with 8 bytes offset. - rev64 v7.4s, v\d\().4s - st1 {v\a\().4s}, [x0], #16 + rev64 v7.4s, \d + st1 {\a}, [x0], #16 ext v7.16b, v7.16b, v7.16b, #8 - st1 {v\b\().4s}, [x0], #16 - rev64 v6.4s, v\c\().4s - st1 {v\c\().4s}, [x0], #16 + st1 {\b}, [x0], #16 + rev64 v6.4s, \c + st1 {\c}, [x0], #16 ext v6.16b, v6.16b, v6.16b, #8 - st1 {v\d\().4s}, [x0], #16 - rev64 v5.4s, v\b\().4s + st1 {\d}, [x0], #16 + rev64 v5.4s, \b st1 {v7.4s}, [x0], #16 ext v5.16b, v5.16b, v5.16b, #8 st1 {v6.4s}, [x0], #16 - rev64 v4.4s, v\a\().4s + rev64 v4.4s, \a st1 {v5.4s}, [x0], #16 ext v4.16b, v4.16b, v4.16b, #8 st1 {v4.4s}, [x0], #16 .endm - store_rev 16, 20, 24, 28 - store_rev 17, 21, 25, 29 - store_rev 18, 22, 26, 30 - store_rev 19, 23, 27, 31 + store_rev v16.4s, v20.4s, v24.4s, v28.4s + store_rev v17.4s, v21.4s, v25.4s, v29.4s + store_rev v18.4s, v22.4s, v26.4s, v30.4s + store_rev v19.4s, v23.4s, v27.4s, v31.4s sub x0, x0, #512 .purgem store_rev @@ -1290,27 +1290,27 @@ function idct32_1d_4x32_pass1_neon // Store the registers a, b, c, d horizontally, // adding into the output first, and the mirrored, // subtracted from the output. -.macro store_rev a, b, c, d +.macro store_rev a, b, c, d, a16b, b16b ld1 {v4.4s}, [x0] - rev64 v9.4s, v\d\().4s - add v4.4s, v4.4s, v\a\().4s + rev64 v9.4s, \d + add v4.4s, v4.4s, \a st1 {v4.4s}, [x0], #16 - rev64 v8.4s, v\c\().4s + rev64 v8.4s, \c ld1 {v4.4s}, [x0] ext v9.16b, v9.16b, v9.16b, #8 - add v4.4s, v4.4s, v\b\().4s + add v4.4s, v4.4s, \b st1 {v4.4s}, [x0], #16 ext v8.16b, v8.16b, v8.16b, #8 ld1 {v4.4s}, [x0] - rev64 v\b\().4s, v\b\().4s - add v4.4s, v4.4s, v\c\().4s + rev64 \b, \b + add v4.4s, v4.4s, \c st1 {v4.4s}, [x0], #16 - rev64 v\a\().4s, v\a\().4s + rev64 \a, \a ld1 {v4.4s}, [x0] - ext v\b\().16b, v\b\().16b, v\b\().16b, #8 - add v4.4s, v4.4s, v\d\().4s + ext \b16b, \b16b, \b16b, #8 + add v4.4s, v4.4s, \d st1 {v4.4s}, [x0], #16 - ext v\a\().16b, v\a\().16b, v\a\().16b, #8 + ext \a16b, \a16b, \a16b, #8 ld1 {v4.4s}, [x0] sub v4.4s, v4.4s, v9.4s st1 {v4.4s}, [x0], #16 @@ -1318,17 +1318,17 @@ function idct32_1d_4x32_pass1_neon sub v4.4s, v4.4s, v8.4s st1 {v4.4s}, [x0], #16 ld1 {v4.4s}, [x0] - sub v4.4s, v4.4s, v\b\().4s + sub v4.4s, v4.4s, \b st1 {v4.4s}, [x0], #16 ld1 {v4.4s}, [x0] - sub v4.4s, v4.4s, v\a\().4s + sub v4.4s, v4.4s, \a st1 {v4.4s}, [x0], #16 .endm - store_rev 31, 27, 23, 19 - store_rev 30, 26, 22, 18 - store_rev 29, 25, 21, 17 - store_rev 28, 24, 20, 16 + store_rev v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b + store_rev v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b + store_rev v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b + store_rev v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b .purgem store_rev ret endfunc @@ -1370,21 +1370,21 @@ function idct32_1d_4x32_pass2_neon .if \neg == 0 ld1 {v4.4s}, [x2], x9 ld1 {v5.4s}, [x2], x9 - add v4.4s, v4.4s, v\a\().4s + add v4.4s, v4.4s, \a ld1 {v6.4s}, [x2], x9 - add v5.4s, v5.4s, v\b\().4s + add v5.4s, v5.4s, \b ld1 {v7.4s}, [x2], x9 - add v6.4s, v6.4s, v\c\().4s - add v7.4s, v7.4s, v\d\().4s + add v6.4s, v6.4s, \c + add v7.4s, v7.4s, \d .else ld1 {v4.4s}, [x2], x7 ld1 {v5.4s}, [x2], x7 - sub v4.4s, v4.4s, v\a\().4s + sub v4.4s, v4.4s, \a ld1 {v6.4s}, [x2], x7 - sub v5.4s, v5.4s, v\b\().4s + sub v5.4s, v5.4s, \b ld1 {v7.4s}, [x2], x7 - sub v6.4s, v6.4s, v\c\().4s - sub v7.4s, v7.4s, v\d\().4s + sub v6.4s, v6.4s, \c + sub v7.4s, v7.4s, \d .endif ld1 {v8.4h}, [x0], x1 ld1 {v8.d}[1], [x0], x1 @@ -1410,15 +1410,15 @@ function idct32_1d_4x32_pass2_neon st1 {v5.4h}, [x0], x1 st1 {v5.d}[1], [x0], x1 .endm - load_acc_store 31, 30, 29, 28 - load_acc_store 27, 26, 25, 24 - load_acc_store 23, 22, 21, 20 - load_acc_store 19, 18, 17, 16 + load_acc_store v31.4s, v30.4s, v29.4s, v28.4s + load_acc_store v27.4s, v26.4s, v25.4s, v24.4s + load_acc_store v23.4s, v22.4s, v21.4s, v20.4s + load_acc_store v19.4s, v18.4s, v17.4s, v16.4s sub x2, x2, x9 - load_acc_store 16, 17, 18, 19, 1 - load_acc_store 20, 21, 22, 23, 1 - load_acc_store 24, 25, 26, 27, 1 - load_acc_store 28, 29, 30, 31, 1 + load_acc_store v16.4s, v17.4s, v18.4s, v19.4s, 1 + load_acc_store v20.4s, v21.4s, v22.4s, v23.4s, 1 + load_acc_store v24.4s, v25.4s, v26.4s, v27.4s, 1 + load_acc_store v28.4s, v29.4s, v30.4s, v31.4s, 1 .purgem load_acc_store ret endfunc