From patchwork Thu Mar 16 22:10:11 2017
X-Patchwork-Submitter: Martin Storsjö
X-Patchwork-Id: 2965
From: Martin Storsjö
To: ffmpeg-devel@ffmpeg.org
Date: Fri, 17 Mar 2017 00:10:11 +0200
Message-Id: <1489702219-12643-6-git-send-email-martin@martin.st>
X-Mailer: git-send-email 2.7.4
In-Reply-To: <1489702219-12643-1-git-send-email-martin@martin.st>
References: <1489702219-12643-1-git-send-email-martin@martin.st>
Subject: [FFmpeg-devel] [PATCH 06/14] arm: vp9itxfm16: Avoid reloading the idct32 coefficients

Keep the idct32 coefficients in narrow form in q6-q7, and idct16
coefficients in lengthened 32 bit form in q0-q3. Avoid clobbering
q0-q3 in the pass1 function, and squeeze the idct16 coefficients into
q0-q1 in the pass2 function to avoid reloading them.

The idct16 coefficients are clobbered and reloaded within idct32_odd
though, since that turns out to be faster than narrowing them and
swapping them into q6-q7.

Before:                                  Cortex A7       A8       A9      A53
vp9_inv_dct_dct_32x32_sub4_add_10_neon:    22653.8  18268.4  19598.0  14079.0
vp9_inv_dct_dct_32x32_sub32_add_10_neon:   37699.0  38665.2  32542.3  24472.2
After:
vp9_inv_dct_dct_32x32_sub4_add_10_neon:    22270.8  18159.3  19531.0  13865.0
vp9_inv_dct_dct_32x32_sub32_add_10_neon:   37523.3  37731.6  32181.7  24071.2
---
 libavcodec/arm/vp9itxfm_16bpp_neon.S | 128 +++++++++++++++++++----------------
 1 file changed, 69 insertions(+), 59 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_16bpp_neon.S b/libavcodec/arm/vp9itxfm_16bpp_neon.S
index 9c02ed9..29d95ca 100644
--- a/libavcodec/arm/vp9itxfm_16bpp_neon.S
+++ b/libavcodec/arm/vp9itxfm_16bpp_neon.S
@@ -1195,12 +1195,12 @@ endfunc
 
 .macro idct32_odd
         movrel          r12, idct_coeffs
-        add             r12, r12, #32
-        vld1.16         {q0-q1}, [r12,:128]
-        vmovl.s16       q2,  d2
-        vmovl.s16       q3,  d3
-        vmovl.s16       q1,  d1
-        vmovl.s16       q0,  d0
+
+        @ Overwrite the idct16 coeffs with the stored ones for idct32
+        vmovl.s16       q0,  d12
+        vmovl.s16       q1,  d13
+        vmovl.s16       q2,  d14
+        vmovl.s16       q3,  d15
 
         mbutterfly      d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
         mbutterfly      d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
@@ -1211,15 +1211,19 @@ endfunc
         mbutterfly      d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
         mbutterfly      d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a
 
-        sub             r12, r12, #32
-        vld1.16         {q0}, [r12,:128]
+        @ Reload the idct16 coefficients. We could swap the coefficients between
+        @ q0-q3 and q6-q7 by narrowing/lengthening, but that's slower than just
+        @ loading and lengthening.
+        vld1.16         {q0-q1}, [r12,:128]
+
+        butterfly       d8,  d24, d16, d24 @ d8  = t16, d24 = t17
+        butterfly       d9,  d20, d28, d20 @ d9  = t19, d20 = t18
+        butterfly       d10, d26, d18, d26 @ d10 = t20, d26 = t21
+        butterfly       d11, d22, d30, d22 @ d11 = t23, d22 = t22
+        vmovl.s16       q2,  d2
+        vmovl.s16       q3,  d3
         vmovl.s16       q1,  d1
         vmovl.s16       q0,  d0
-
-        butterfly       d4,  d24, d16, d24 @ d4  = t16, d24 = t17
-        butterfly       d5,  d20, d28, d20 @ d5  = t19, d20 = t18
-        butterfly       d6,  d26, d18, d26 @ d6  = t20, d26 = t21
-        butterfly       d7,  d22, d30, d22 @ d7  = t23, d22 = t22
         butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
         butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
         butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
@@ -1230,34 +1234,34 @@ endfunc
         mbutterfly      d21, d26, d3[0], d3[1], q8, q9        @ d21 = t21a, d26 = t26a
         mbutterfly      d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
 
-        butterfly       d16, d5,  d4,  d5  @ d16 = t16a, d5  = t19a
+        butterfly       d16, d9,  d8,  d9  @ d16 = t16a, d9  = t19a
         butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
-        butterfly       d18, d6,  d7,  d6  @ d18 = t23a, d6  = t20a
+        butterfly       d18, d10, d11, d10 @ d18 = t23a, d10 = t20a
         butterfly       d19, d21, d22, d21 @ d19 = t22,  d21 = t21
-        butterfly       d4,  d28, d28, d30 @ d4  = t24a, d28 = t27a
+        butterfly       d8,  d28, d28, d30 @ d8  = t24a, d28 = t27a
         butterfly       d23, d26, d25, d26 @ d23 = t25,  d26 = t26
-        butterfly       d7,  d29, d29, d31 @ d7  = t31a, d29 = t28a
+        butterfly       d11, d29, d29, d31 @ d11 = t31a, d29 = t28a
         butterfly       d22, d27, d24, d27 @ d22 = t30,  d27 = t29
 
         mbutterfly      d27, d20, d1[0], d1[1], q12, q15        @ d27 = t18a, d20 = t29a
-        mbutterfly      d29, d5,  d1[0], d1[1], q12, q15        @ d29 = t19,  d5  = t28
-        mbutterfly      d28, d6,  d1[0], d1[1], q12, q15, neg=1 @ d28 = t27,  d6  = t20
+        mbutterfly      d29, d9,  d1[0], d1[1], q12, q15        @ d29 = t19,  d9  = t28
+        mbutterfly      d28, d10, d1[0], d1[1], q12, q15, neg=1 @ d28 = t27,  d10 = t20
         mbutterfly      d26, d21, d1[0], d1[1], q12, q15, neg=1 @ d26 = t26a, d21 = t21a
 
-        butterfly       d31, d24, d7,  d4  @ d31 = t31,  d24 = t24
+        butterfly       d31, d24, d11, d8  @ d31 = t31,  d24 = t24
         butterfly       d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
         butterfly_r     d23, d16, d16, d18 @ d23 = t23,  d16 = t16
         butterfly_r     d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
         butterfly       d18, d21, d27, d21 @ d18 = t18,  d21 = t21
-        butterfly_r     d27, d28, d5,  d28 @ d27 = t27a, d28 = t28a
-        butterfly       d4,  d26, d20, d26 @ d4  = t29,  d26 = t26
-        butterfly       d19, d20, d29, d6  @ d19 = t19a, d20 = t20
-        vmov            d29, d4            @ d29 = t29
-
-        mbutterfly0     d27, d20, d27, d20, d4, d6, q2, q3 @ d27 = t27,  d20 = t20
-        mbutterfly0     d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a
-        mbutterfly0     d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 = t22
-        mbutterfly0     d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a
+        butterfly_r     d27, d28, d9,  d28 @ d27 = t27a, d28 = t28a
+        butterfly       d8,  d26, d20, d26 @ d8  = t29,  d26 = t26
+        butterfly       d19, d20, d29, d10 @ d19 = t19a, d20 = t20
+        vmov            d29, d8            @ d29 = t29
+
+        mbutterfly0     d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27,  d20 = t20
+        mbutterfly0     d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
+        mbutterfly0     d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25,  d22 = t22
+        mbutterfly0     d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
 .endm
 
 @ Do an 32-point IDCT of a 2x32 slice out of a 32x32 matrix.
@@ -1270,13 +1274,6 @@ endfunc
 @ r1 = unused
 @ r2 = src
 function idct32_1d_2x32_pass1_neon
-        movrel          r12, idct_coeffs
-        vld1.16         {q0-q1}, [r12,:128]
-        vmovl.s16       q2,  d2
-        vmovl.s16       q3,  d3
-        vmovl.s16       q1,  d1
-        vmovl.s16       q0,  d0
-
         @ Double stride of the input, since we only read every other line
         mov             r12, #256
         vmov.s32        d8,  #0
@@ -1315,11 +1312,11 @@ function idct32_1d_2x32_pass1_neon
         sub             r2,  r2,  r12, lsl #4
         add             r2,  r2,  #128
 
-        vmov.s32        d4,  #0
+        vmov.s32        d8,  #0
         @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64]
-        vst1.16         {d4}, [r2,:64], r12
+        vst1.16         {d8}, [r2,:64], r12
 .endr
 
         idct32_odd
@@ -1331,15 +1328,15 @@ function idct32_1d_2x32_pass1_neon
 @ from the output.
 .macro store_rev a, b, c, d, e, f, g, h
 .irp i, \a, \b, \c, \d, \e, \f, \g, \h
-        vld1.32         {d4},  [r0,:64]
-        vadd.s32        d4,  d4,  d\i
-        vst1.32         {d4},  [r0,:64]!
+        vld1.32         {d8},  [r0,:64]
+        vadd.s32        d8,  d8,  d\i
+        vst1.32         {d8},  [r0,:64]!
         vrev64.32       d\i, d\i
 .endr
 .irp i, \h, \g, \f, \e, \d, \c, \b, \a
-        vld1.32         {d4},  [r0,:64]
-        vsub.s32        d4,  d4,  d\i
-        vst1.32         {d4},  [r0,:64]!
+        vld1.32         {d8},  [r0,:64]
+        vsub.s32        d8,  d8,  d\i
+        vst1.32         {d8},  [r0,:64]!
 .endr
 .endm
 
@@ -1357,13 +1354,6 @@ endfunc
 @ r1 = dst stride
 @ r2 = src (temp buffer)
 function idct32_1d_2x32_pass2_neon
-        movrel          r12, idct_coeffs
-        vld1.16         {q0-q1}, [r12,:128]
-        vmovl.s16       q2,  d2
-        vmovl.s16       q3,  d3
-        vmovl.s16       q1,  d1
-        vmovl.s16       q0,  d0
-
         mov             r12, #256
         @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1389,6 +1379,13 @@ function idct32_1d_2x32_pass2_neon
 
         idct32_odd
 
+        @ Narrow the idct16 coefficients in q0-q3 into q0-q1, to
+        @ allow clobbering q2-q3 below.
+        vmovn.s32       d0,  q0
+        vmovn.s32       d1,  q1
+        vmovn.s32       d2,  q2
+        vmovn.s32       d3,  q3
+
         mov             r12, #256
         vdup.s16        q4,  r9
 .macro load_acc_store a, b, c, d, neg=0
@@ -1409,15 +1406,15 @@ function idct32_1d_2x32_pass2_neon
         vsub.s32        d6,  d6,  d\c
         vsub.s32        d7,  d7,  d\d
 .endif
-        vld1.32         {d2[]},   [r0,:32], r1
-        vld1.32         {d2[1]},  [r0,:32], r1
+        vld1.32         {d10[]},  [r0,:32], r1
+        vld1.32         {d10[1]}, [r0,:32], r1
         vrshr.s32       q2,  q2,  #6
-        vld1.32         {d3[]},   [r0,:32], r1
+        vld1.32         {d11[]},  [r0,:32], r1
         vrshr.s32       q3,  q3,  #6
-        vld1.32         {d3[1]},  [r0,:32], r1
+        vld1.32         {d11[1]}, [r0,:32], r1
         sub             r0,  r0,  r1,  lsl #2
-        vaddw.u16       q2,  q2,  d2
-        vaddw.u16       q3,  q3,  d3
+        vaddw.u16       q2,  q2,  d10
+        vaddw.u16       q3,  q3,  d11
         vqmovun.s32     d4,  q2
         vqmovun.s32     d5,  q3
         vmin.u16        q2,  q2,  q4
@@ -1437,6 +1434,11 @@ function idct32_1d_2x32_pass2_neon
         load_acc_store  24, 25, 26, 27, 1
         load_acc_store  28, 29, 30, 31, 1
 .purgem load_acc_store
+        @ Lengthen the idct16 coeffs back into 32 bit form
+        vmovl.s16       q2,  d2
+        vmovl.s16       q3,  d3
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
         bx              lr
 endfunc
 
@@ -1447,7 +1449,7 @@ endconst
 function vp9_idct_idct_32x32_add_16_neon
         cmp             r3,  #1
         beq             idct32x32_dc_add_neon
-        vpush           {q4-q5}
+        vpush           {q4-q7}
         movrel          r8,  min_eob_idct_idct_32 + 2
 
         @ Align the stack, allocate a temp buffer
@@ -1461,6 +1463,14 @@ A and r7, sp, #15
         mov             r5,  r1
         mov             r6,  r2
 
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]!
+        vld1.16         {q6-q7}, [r12,:128]
+        vmovl.s16       q2,  d2
+        vmovl.s16       q3,  d3
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+
 .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
         add             r0,  sp,  #(\i*128)
 .if \i > 0
@@ -1498,7 +1508,7 @@ A and r7, sp, #15
 .endr
 
         add             sp,  sp,  r7
-        vpop            {q4-q5}
+        vpop            {q4-q7}
         pop             {r4-r9,pc}
 endfunc
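
Not part of the patch itself, but as an illustration for reviewers: the narrow
(16 bit) vs lengthened (32 bit) coefficient forms the commit message relies on
can be summarised with the following standalone sketch, which mirrors the
vmovl.s16/vmovn.s32 sequences used in idct32_odd and in the pass2 function.
The function name and the use of r0 as a pointer to 16 aligned int16_t
coefficients are assumptions made for the example only.

#include "libavutil/arm/asm.S"

@ coeff_form_demo: hypothetical example function, not taken from the patch.
@ r0 = pointer to 16 consecutive int16_t coefficients, 16 byte aligned.
function coeff_form_demo
        @ Narrow 16 bit form: all 16 coefficients fit in q0-q1.
        vld1.16         {q0-q1},  [r0,:128]

        @ Lengthened 32 bit form: the same coefficients spread over q0-q3.
        @ The order matters: writing q1 clobbers d2-d3 and writing q0
        @ clobbers d1, so those sources are consumed first.
        vmovl.s16       q2,  d2        @ q2 = coefficients 8-11,  32 bit
        vmovl.s16       q3,  d3        @ q3 = coefficients 12-15, 32 bit
        vmovl.s16       q1,  d1        @ q1 = coefficients 4-7,   32 bit
        vmovl.s16       q0,  d0        @ q0 = coefficients 0-3,   32 bit

        @ Squeeze them back into q0-q1, freeing q2-q3 as scratch registers;
        @ this is lossless as long as the values fit in 16 bits, which the
        @ idct coefficients do.
        vmovn.s32       d0,  q0        @ d0 = coefficients 0-3,   16 bit
        vmovn.s32       d1,  q1        @ d1 = coefficients 4-7,   16 bit
        vmovn.s32       d2,  q2        @ d2 = coefficients 8-11,  16 bit
        vmovn.s32       d3,  q3        @ d3 = coefficients 12-15, 16 bit
        bx              lr
endfunc

This is the same trade-off the patch makes in pass2: narrowing keeps the idct16
coefficients resident while q2-q3 are reused, at the cost of the vmovl/vmovn
pairs, which is why idct32_odd instead prefers a plain reload from idct_coeffs.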