From patchwork Wed Mar 8 10:00:43 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 2823 Delivered-To: ffmpegpatchwork@gmail.com Received: by 10.103.50.79 with SMTP id y76csp954729vsy; Wed, 8 Mar 2017 02:07:10 -0800 (PST) X-Received: by 10.223.162.155 with SMTP id s27mr4178744wra.159.1488967630758; Wed, 08 Mar 2017 02:07:10 -0800 (PST) Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id 7si22555493wms.86.2017.03.08.02.07.10; Wed, 08 Mar 2017 02:07:10 -0800 (PST) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; dkim=neutral (body hash did not verify) header.i=@martin-st.20150623.gappssmtp.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id B710C6882D4; Wed, 8 Mar 2017 12:06:54 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-lf0-f66.google.com (mail-lf0-f66.google.com [209.85.215.66]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 9985B6809EB for ; Wed, 8 Mar 2017 12:06:48 +0200 (EET) Received: by mail-lf0-f66.google.com with SMTP id g70so1995674lfh.3 for ; Wed, 08 Mar 2017 02:07:02 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=martin-st.20150623.gappssmtp.com; s=20150623; h=from:to:subject:date:message-id:in-reply-to:references; bh=K6v3N6J1WPKYOdMp9MDfbcYmPLmqbXvD2nXDOewsDPQ=; b=us7ByLCaJdbZ1XNnxgmuNz/W0Eq/cV2FElGucpx8PqTSiAx4yhimcGdTI5cALtRmbF yqO+XXep76avJrH8Szkxg35U3yDJYrzLF1Rr5ofJXPOAKkfgznMvTPq9HJcFNBAKLlyh 
AbDBCXWv7qH1D9QXSDblFDm/9sSu1Eeb3sE6f56hY8vY7kxqAdcCAUAIUb3WIhEAaQyw 47ceeLf26zHZOAAzeh+qj4fYYNYbNroENVf/NV3j7+iTN30JSBp1EklwSDzVWkyN8D1n B7U0V7k1tvhIE63tuMuSm277QwGuOqDpPg7T6X2MckautrBN+4tzRhKmcO4r4fNH/D9b HJNw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references; bh=K6v3N6J1WPKYOdMp9MDfbcYmPLmqbXvD2nXDOewsDPQ=; b=fw71Av3kNlvn7Zr56bPCtm9Id8xrzH8ITZa9bVl18BPjFOj5aV1+Nba9psqnCJy+9I U8bdPEiZGCm9QkofZB18hQMpX7Hb0t2WtDcKNehSm9MRcD0ucGaJjPzJJuVpkF2rAPgi ZDT9via/WDseej0jsS8uuJqf5tnB+aIQuO1QCrf04oMttfVt/ROX3FmZefFS0KEbuaDI xHxOnFz29JDW0KzgjcRlePcmDOGoD9Z7H2AERcOF9uQgtoXElu0aG1U5LevMMLszk9WQ jGgZnIs+/qqJm2wkMgwaooH1ULh4waGpeb8iDgYuLS1Wq9wb2AbmfyRzEJZXial/KMxL RgVQ== X-Gm-Message-State: AMke39mO0g5cn3Y562yEhsicXgpOMFL/xJ+A8TzHpHQZKL3MO2xSX1HmzveraLQu7rElLA== X-Received: by 10.25.157.65 with SMTP id g62mr1496787lfe.29.1488967278902; Wed, 08 Mar 2017 02:01:18 -0800 (PST) Received: from localhost.localdomain ([2001:470:28:852:7d47:68e:13e8:4933]) by smtp.gmail.com with ESMTPSA id m127sm513064lfg.58.2017.03.08.02.01.18 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128); Wed, 08 Mar 2017 02:01:18 -0800 (PST) From: =?UTF-8?q?Martin=20Storsj=C3=B6?= To: ffmpeg-devel@ffmpeg.org Date: Wed, 8 Mar 2017 12:00:43 +0200 Message-Id: <1488967274-8143-3-git-send-email-martin@martin.st> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1488967274-8143-1-git-send-email-martin@martin.st> References: <1488967274-8143-1-git-send-email-martin@martin.st> Subject: [FFmpeg-devel] [PATCH 03/34] arm: vp9itxfm: Make the larger core transforms standalone functions X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: 
"ffmpeg-devel" This work is sponsored by, and copyright, Google. This reduces the code size of libavcodec/arm/vp9itxfm_neon.o from 15324 to 12388 bytes. This gives a small slowdown of a couple of tens of cycles, up to around 150 cycles for the full case of the largest transform, but makes it more feasible to add more optimized versions of these transforms. Before (cycle counts): Cortex A7 A8 A9 A53 vp9_inv_dct_dct_16x16_sub4_add_neon: 2063.4 1516.0 1719.5 1245.1 vp9_inv_dct_dct_16x16_sub16_add_neon: 3279.3 2454.5 2525.2 1982.3 vp9_inv_dct_dct_32x32_sub4_add_neon: 10750.0 7955.4 8525.6 6754.2 vp9_inv_dct_dct_32x32_sub32_add_neon: 18574.0 17108.4 14216.7 12010.2 After: vp9_inv_dct_dct_16x16_sub4_add_neon: 2060.8 1608.5 1735.7 1262.0 vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5 vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1 vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9 This is cherry-picked from libav commit 0331c3f5e8cb6e6b53fab7893e91d1be1bfa979c. 
--- libavcodec/arm/vp9itxfm_neon.S | 43 +++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 93816d2..328bb01 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -534,7 +534,7 @@ function idct16x16_dc_add_neon endfunc .ltorg -.macro idct16 +function idct16 mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a @@ -580,9 +580,10 @@ endfunc vmov d4, d21 @ d4 = t10a butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11] butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10] -.endm + bx lr +endfunc -.macro iadst16 +function iadst16 movrel r12, iadst16_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -653,7 +654,8 @@ endfunc vmov d16, d2 vmov d30, d4 -.endm + bx lr +endfunc .macro itxfm16_1d_funcs txfm @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -662,6 +664,8 @@ endfunc @ r1 = slice offset @ r2 = src function \txfm\()16_1d_4x16_pass1_neon + push {lr} + mov r12, #32 vmov.s16 q2, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -669,7 +673,7 @@ function \txfm\()16_1d_4x16_pass1_neon vst1.16 {d4}, [r2,:64], r12 .endr - \txfm\()16 + bl \txfm\()16 @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @@ -682,7 +686,7 @@ function \txfm\()16_1d_4x16_pass1_neon .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 vst1.16 {d\i}, [r0,:64]! 
.endr - bx lr + pop {pc} 1: @ Special case: For the last input column (r1 == 12), @ which would be stored as the last row in the temp buffer, @@ -709,7 +713,7 @@ function \txfm\()16_1d_4x16_pass1_neon vmov d29, d17 vmov d30, d18 vmov d31, d19 - bx lr + pop {pc} endfunc @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -719,6 +723,7 @@ endfunc @ r2 = src (temp buffer) @ r3 = slice offset function \txfm\()16_1d_4x16_pass2_neon + push {lr} mov r12, #32 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 vld1.16 {d\i}, [r2,:64], r12 @@ -732,7 +737,7 @@ function \txfm\()16_1d_4x16_pass2_neon add r3, r0, r1 lsl r1, r1, #1 - \txfm\()16 + bl \txfm\()16 .macro load_add_store coef0, coef1, coef2, coef3 vrshr.s16 \coef0, \coef0, #6 @@ -773,7 +778,7 @@ function \txfm\()16_1d_4x16_pass2_neon load_add_store q12, q13, q14, q15 .purgem load_add_store - bx lr + pop {pc} endfunc .endm @@ -908,7 +913,7 @@ function idct32x32_dc_add_neon bx lr endfunc -.macro idct32_odd +function idct32_odd movrel r12, idct_coeffs add r12, r12, #32 vld1.16 {q0-q1}, [r12,:128] @@ -967,7 +972,8 @@ endfunc mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a -.endm + bx lr +endfunc @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. @ We don't have register space to do a single pass IDCT of 4x32 though, @@ -979,6 +985,8 @@ endfunc @ r1 = unused @ r2 = src function idct32_1d_4x32_pass1_neon + push {lr} + movrel r12, idct_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -992,7 +1000,7 @@ function idct32_1d_4x32_pass1_neon vst1.16 {d4}, [r2,:64], r12 .endr - idct16 + bl idct16 @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. 
Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @@ -1028,7 +1036,7 @@ function idct32_1d_4x32_pass1_neon vst1.16 {d4}, [r2,:64], r12 .endr - idct32_odd + bl idct32_odd transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 @@ -1054,7 +1062,7 @@ function idct32_1d_4x32_pass1_neon store_rev 29, 25, 21, 17 store_rev 28, 24, 20, 16 .purgem store_rev - bx lr + pop {pc} endfunc .ltorg @@ -1065,6 +1073,7 @@ endfunc @ r1 = dst stride @ r2 = src (temp buffer) function idct32_1d_4x32_pass2_neon + push {lr} movrel r12, idct_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -1075,7 +1084,7 @@ function idct32_1d_4x32_pass2_neon .endr sub r2, r2, r12, lsl #4 - idct16 + bl idct16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vst1.16 {d\i}, [r2,:64], r12 @@ -1091,7 +1100,7 @@ function idct32_1d_4x32_pass2_neon sub r2, r2, r12, lsl #4 sub r2, r2, #64 - idct32_odd + bl idct32_odd mov r12, #128 .macro load_acc_store a, b, c, d, neg=0 @@ -1139,7 +1148,7 @@ function idct32_1d_4x32_pass2_neon load_acc_store 24, 25, 26, 27, 1 load_acc_store 28, 29, 30, 31, 1 .purgem load_acc_store - bx lr + pop {pc} endfunc const min_eob_idct_idct_32, align=4