From patchwork Thu Mar 16 22:10:16 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 2970 Delivered-To: ffmpegpatchwork@gmail.com Received: by 10.103.50.79 with SMTP id y76csp3838vsy; Thu, 16 Mar 2017 15:18:12 -0700 (PDT) X-Received: by 10.223.147.66 with SMTP id 60mr10241418wro.173.1489702691989; Thu, 16 Mar 2017 15:18:11 -0700 (PDT) Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id h200si437707wmd.148.2017.03.16.15.18.11; Thu, 16 Mar 2017 15:18:11 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; dkim=neutral (body hash did not verify) header.i=@martin-st.20150623.gappssmtp.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id C090F68834B; Fri, 17 Mar 2017 00:17:50 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-lf0-f65.google.com (mail-lf0-f65.google.com [209.85.215.65]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 73AAC688329 for ; Fri, 17 Mar 2017 00:17:44 +0200 (EET) Received: by mail-lf0-f65.google.com with SMTP id r36so4324544lfi.0 for ; Thu, 16 Mar 2017 15:18:01 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=martin-st.20150623.gappssmtp.com; s=20150623; h=from:to:subject:date:message-id:in-reply-to:references; bh=p2RhVtysMfdb16rK9kSLiTmt9Qim2YCadjzI1kvZudU=; b=03Zs50r2Oo6PtQ8IJcKZOFTGnzLsSy54A6zUklmVV4elv5jLoB7gvMd4OhAmWW5Jhq +VcK/xHHpXH/u9kJax7SshdecaTCMPQV0fy2Obj4kmmxy7HcV2C+mKStxzEVXNAsApqf +gvO/dci06S9xkIGEGxDYW8B/vqGK9HGorh/QUpZ0Bgfl/Ts2/PGF2wiJRZiYrvTjDuE GY56gDdnTEuUJ/oh+tjRknABgSPmr0UVJv9xP27aHuADxTNT5OtZOViuTW3oCSYY08+t n8KMAzVWg/T/+rwMWO16SF3BSzphNjpYA/wR5LslsRjmpU/lPDhgDId8ZhmJkMI+UJaU S4tA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references; bh=p2RhVtysMfdb16rK9kSLiTmt9Qim2YCadjzI1kvZudU=; b=BrJR4UYPzUyoHDxvscu/Zu0518qXr8a2WArIPwRdvF7XN38stttegeFE2sbzAlxuxr /yZe9ufodHTAqN9iRO4wYKF6yevd3ecFYFNNFqn6ZXDPo7rYursuj8eVctNbzeQpDpgn 4k5HffFK7fI+5YDMgEBy8BVzswVvrahqwJKUzFo8fhF+WPeq66ohwW+Wke1xx8e4flgG F1emRGfhS0de9uMeB7lWDvbASrw66l0FSFLVPzKeJFsCKvh5XHI6dYYD/XHjznCdZnQb rIXfHOrCmYv2TD/e4u0BdJxJAsOvjcq9qwFsehNylzCFlveTpxzL0ge2Ain2Hz9Yopzh qdpQ== X-Gm-Message-State: AFeK/H0O62CG2l4PTtUH4dkciNcGRzfDCfsJpSliRuYZYw8tXNMi9Suequo9jZIPWlkTEw== X-Received: by 10.25.155.132 with SMTP id d126mr3565318lfe.110.1489702230067; Thu, 16 Mar 2017 15:10:30 -0700 (PDT) Received: from localhost.localdomain ([2001:470:28:852:10ad:e858:1f3b:5c2c]) by smtp.gmail.com with ESMTPSA id g3sm1124718lfe.34.2017.03.16.15.10.29 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128); Thu, 16 Mar 2017 15:10:29 -0700 (PDT) From: =?UTF-8?q?Martin=20Storsj=C3=B6?= To: ffmpeg-devel@ffmpeg.org Date: Fri, 17 Mar 2017 00:10:16 +0200 Message-Id: <1489702219-12643-11-git-send-email-martin@martin.st> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1489702219-12643-1-git-send-email-martin@martin.st> References: <1489702219-12643-1-git-send-email-martin@martin.st> Subject: [FFmpeg-devel] [PATCH 11/14] aarch64: vp9itxfm16: Make the larger core transforms standalone functions X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" This work is sponsored by, and copyright, Google. This reduces the code size of libavcodec/aarch64/vp9itxfm_16bpp_neon.o from 26288 to 21512 bytes. This gives a small slowdown of a couple of tens of cycles, but makes it more feasible to add more optimized versions of these transforms. Before: vp9_inv_dct_dct_16x16_sub4_add_10_neon: 1887.4 vp9_inv_dct_dct_16x16_sub16_add_10_neon: 2801.5 vp9_inv_dct_dct_32x32_sub4_add_10_neon: 9691.4 vp9_inv_dct_dct_32x32_sub32_add_10_neon: 16154.9 After: vp9_inv_dct_dct_16x16_sub4_add_10_neon: 1899.5 vp9_inv_dct_dct_16x16_sub16_add_10_neon: 2827.2 vp9_inv_dct_dct_32x32_sub4_add_10_neon: 9714.7 vp9_inv_dct_dct_32x32_sub32_add_10_neon: 16175.9 --- libavcodec/aarch64/vp9itxfm_16bpp_neon.S | 45 ++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S index a97c1b6..de1da55 100644 --- a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S +++ b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S @@ -710,7 +710,7 @@ function idct16x16_dc_add_neon ret endfunc -.macro idct16 +function idct16 dmbutterfly0 v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a dmbutterfly v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a dmbutterfly v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a @@ -753,9 +753,10 @@ endfunc butterfly_4s v19, v28, v5, v28 // v19 = out[3], v28 = out[12] butterfly_4s v20, v27, v6, v27 // v20 = out[4], v27 = out[11] butterfly_4s v21, v26, v26, v9 // v21 = out[5], v26 = out[10] -.endm + ret +endfunc -.macro iadst16 +function iadst16 ld1 {v0.8h,v1.8h}, [x11] sxtl v2.4s, v1.4h sxtl2 v3.4s, v1.8h @@ -830,7 +831,8 @@ endfunc mov v16.16b, v2.16b mov v30.16b, v4.16b -.endm + ret +endfunc // Helper macros; we can't use these expressions directly within // e.g. .irp due to the extra concatenation \(). Therefore wrap @@ -857,12 +859,14 @@ endfunc // x9 = input stride .macro itxfm16_1d_funcs txfm function \txfm\()16_1d_4x16_pass1_neon + mov x14, x30 + movi v4.4s, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 load_clear \i, x2, x9 .endr - \txfm\()16 + bl \txfm\()16 // Do four 4x4 transposes. Originally, v16-v31 contain the // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 @@ -878,7 +882,7 @@ function \txfm\()16_1d_4x16_pass1_neon .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 store \i, x0, #16 .endr - ret + br x14 1: // Special case: For the last input column (x1 == 12), // which would be stored as the last row in the temp buffer, @@ -906,7 +910,7 @@ function \txfm\()16_1d_4x16_pass1_neon mov v29.16b, v17.16b mov v30.16b, v18.16b mov v31.16b, v19.16b - ret + br x14 endfunc // Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -917,6 +921,8 @@ endfunc // x3 = slice offset // x9 = temp buffer stride function \txfm\()16_1d_4x16_pass2_neon + mov x14, x30 + .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 load \i, x2, x9 .endr @@ -928,7 +934,7 @@ function \txfm\()16_1d_4x16_pass2_neon add x3, x0, x1 lsl x1, x1, #1 - \txfm\()16 + bl \txfm\()16 dup v8.8h, w13 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7 @@ -983,7 +989,7 @@ function \txfm\()16_1d_4x16_pass2_neon load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s .purgem load_add_store - ret + br x14 endfunc .endm @@ -1158,7 +1164,7 @@ function idct32x32_dc_add_neon ret endfunc -.macro idct32_odd +function idct32_odd dmbutterfly v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a dmbutterfly v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a dmbutterfly v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a @@ -1209,7 +1215,8 @@ endfunc dmbutterfly0 v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a dmbutterfly0 v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25, v22 = t22 dmbutterfly0 v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a -.endm + ret +endfunc // Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. // The 32-point IDCT can be decomposed into two 16-point IDCTs; @@ -1221,6 +1228,8 @@ endfunc // x2 = src // x9 = double input stride function idct32_1d_4x32_pass1_neon + mov x14, x30 + movi v4.4s, #0 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) @@ -1229,7 +1238,7 @@ function idct32_1d_4x32_pass1_neon st1 {v4.4s}, [x2], x9 .endr - idct16 + bl idct16 // Do four 4x4 transposes. Originally, v16-v31 contain the // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 @@ -1280,7 +1289,7 @@ function idct32_1d_4x32_pass1_neon st1 {v4.4s}, [x2], x9 .endr - idct32_odd + bl idct32_odd transpose_4x4s v31, v30, v29, v28, v4, v5, v6, v7 transpose_4x4s v27, v26, v25, v24, v4, v5, v6, v7 @@ -1330,7 +1339,7 @@ function idct32_1d_4x32_pass1_neon store_rev v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b store_rev v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b .purgem store_rev - ret + br x14 endfunc // This is mostly the same as 4x32_pass1, but without the transpose, @@ -1342,13 +1351,15 @@ endfunc // x7 = negative double temp buffer stride // x9 = double temp buffer stride function idct32_1d_4x32_pass2_neon + mov x14, x30 + // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().4s}, [x2], x9 .endr sub x2, x2, x9, lsl #4 - idct16 + bl idct16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 st1 {v\i\().4s}, [x2], x9 @@ -1364,7 +1375,7 @@ function idct32_1d_4x32_pass2_neon sub x2, x2, x9, lsl #4 sub x2, x2, #128 - idct32_odd + bl idct32_odd .macro load_acc_store a, b, c, d, neg=0 .if \neg == 0 @@ -1420,7 +1431,7 @@ function idct32_1d_4x32_pass2_neon load_acc_store v24.4s, v25.4s, v26.4s, v27.4s, 1 load_acc_store v28.4s, v29.4s, v30.4s, v31.4s, 1 .purgem load_acc_store - ret + br x14 endfunc const min_eob_idct_idct_32, align=4