[FFmpeg-devel,30/34] aarch64: vp9itxfm: Avoid reloading the idct32 coefficients

Message ID	1488967274-8143-30-git-send-email-martin@martin.st
State	Accepted
Commit	65aa002d54433154a6924dc13e498bec98451ad0
Headers	show Delivered-To: ffmpegpatchwork@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: Martin Storsjö <martin@martin.st> To: ffmpeg-devel@ffmpeg.org Date: Wed, 8 Mar 2017 12:01:10 +0200 Message-Id: <1488967274-8143-30-git-send-email-martin@martin.st> In-Reply-To: <1488967274-8143-1-git-send-email-martin@martin.st> References: <1488967274-8143-1-git-send-email-martin@martin.st> Subject: [FFmpeg-devel] [PATCH 30/34] aarch64: vp9itxfm: Avoid reloading the idct32 coefficients Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index be65eb7..dd9fde1 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -1123,18 +1123,14 @@ endfunc .endm function idct32_odd - ld1 {v0.8h,v1.8h}, [x11] - - dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a - dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a - dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a - dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a - dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a - dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a - dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a - dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a - - ld1 {v0.8h}, [x10] + dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 @@ -1153,18 +1149,14 @@ function idct32_odd endfunc function idct32_odd_half - ld1 {v0.8h,v1.8h}, [x11] - - dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a - dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // 
v24 = t17a, v23 = t30a - dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a - dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a - dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a - dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a - dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a - dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a - - ld1 {v0.8h}, [x10] + dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 @@ -1183,18 +1175,14 @@ function idct32_odd_half endfunc function idct32_odd_quarter - ld1 {v0.8h,v1.8h}, [x11] - - dsmull_h v4, v5, v16, v0.h[0] - dsmull_h v28, v29, v19, v0.h[7] - dsmull_h v30, v31, v16, v0.h[1] - dsmull_h v22, v23, v17, v1.h[6] - dsmull_h v7, v6, v17, v1.h[7] - dsmull_h v26, v27, v19, v0.h[6] - dsmull_h v20, v21, v18, v1.h[0] - dsmull_h v24, v25, v18, v1.h[1] - - ld1 {v0.8h}, [x10] + dsmull_h v4, v5, v16, v8.h[0] + dsmull_h v28, v29, v19, v8.h[7] + dsmull_h v30, v31, v16, v8.h[1] + dsmull_h v22, v23, v17, v9.h[6] + dsmull_h v7, v6, v17, v9.h[7] + dsmull_h v26, v27, v19, v8.h[6] + dsmull_h v20, v21, v18, v9.h[0] + 
dsmull_h v24, v25, v18, v9.h[1] neg v28.4s, v28.4s neg v29.4s, v29.4s @@ -1240,12 +1228,8 @@ endfunc // x1 = unused // x2 = src // x9 = double input stride -// x10 = idct_coeffs -// x11 = idct_coeffs + 32 function idct32_1d_8x32_pass1\suffix\()_neon mov x14, x30 - ld1 {v0.8h,v1.8h}, [x10] - movi v2.8h, #0 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) @@ -1278,14 +1262,14 @@ function idct32_1d_8x32_pass1\suffix\()_neon .macro store_rev a, b // There's no rev128 instruction, but we reverse each 64 bit // half, and then flip them using an ext with 8 bytes offset. - rev64 v1.8h, \b + rev64 v3.8h, \b st1 {\a}, [x0], #16 - rev64 v0.8h, \a - ext v1.16b, v1.16b, v1.16b, #8 + rev64 v2.8h, \a + ext v3.16b, v3.16b, v3.16b, #8 st1 {\b}, [x0], #16 - ext v0.16b, v0.16b, v0.16b, #8 - st1 {v1.8h}, [x0], #16 - st1 {v0.8h}, [x0], #16 + ext v2.16b, v2.16b, v2.16b, #8 + st1 {v3.8h}, [x0], #16 + st1 {v2.8h}, [x0], #16 .endm store_rev v16.8h, v24.8h store_rev v17.8h, v25.8h @@ -1339,20 +1323,20 @@ function idct32_1d_8x32_pass1\suffix\()_neon // subtracted from the output. .macro store_rev a, b ld1 {v4.8h}, [x0] - rev64 v1.8h, \b + rev64 v3.8h, \b add v4.8h, v4.8h, \a - rev64 v0.8h, \a + rev64 v2.8h, \a st1 {v4.8h}, [x0], #16 - ext v1.16b, v1.16b, v1.16b, #8 + ext v3.16b, v3.16b, v3.16b, #8 ld1 {v5.8h}, [x0] - ext v0.16b, v0.16b, v0.16b, #8 + ext v2.16b, v2.16b, v2.16b, #8 add v5.8h, v5.8h, \b st1 {v5.8h}, [x0], #16 ld1 {v6.8h}, [x0] - sub v6.8h, v6.8h, v1.8h + sub v6.8h, v6.8h, v3.8h st1 {v6.8h}, [x0], #16 ld1 {v7.8h}, [x0] - sub v7.8h, v7.8h, v0.8h + sub v7.8h, v7.8h, v2.8h st1 {v7.8h}, [x0], #16 .endm @@ -1376,12 +1360,8 @@ endfunc // x2 = src (temp buffer) // x7 = negative double temp buffer stride // x9 = double temp buffer stride -// x10 = idct_coeffs -// x11 = idct_coeffs + 32 function idct32_1d_8x32_pass2\suffix\()_neon mov x14, x30 - ld1 {v0.8h,v1.8h}, [x10] - // v16 = IN(0), v17 = IN(2) ... 
v31 = IN(30) .ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1454,15 +1434,15 @@ function idct32_1d_8x32_pass2\suffix\()_neon sub v6.8h, v6.8h, \c sub v7.8h, v7.8h, \d .endif - ld1 {v0.8b}, [x0], x1 - ld1 {v1.8b}, [x0], x1 + ld1 {v10.8b}, [x0], x1 + ld1 {v11.8b}, [x0], x1 srshr v4.8h, v4.8h, #6 ld1 {v2.8b}, [x0], x1 srshr v5.8h, v5.8h, #6 - uaddw v4.8h, v4.8h, v0.8b + uaddw v4.8h, v4.8h, v10.8b ld1 {v3.8b}, [x0], x1 srshr v6.8h, v6.8h, #6 - uaddw v5.8h, v5.8h, v1.8b + uaddw v5.8h, v5.8h, v11.8b srshr v7.8h, v7.8h, #6 sub x0, x0, x1, lsl #2 uaddw v6.8h, v6.8h, v2.8b @@ -1503,13 +1483,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 b.eq idct32x32_dc_add_neon movrel x10, idct_coeffs - add x11, x10, #32 movrel x12, min_eob_idct_idct_32, 2 mov x15, x30 - stp d14, d15, [sp, #-0x10]! - stp d12, d13, [sp, #-0x10]! stp d10, d11, [sp, #-0x10]! stp d8, d9, [sp, #-0x10]! @@ -1523,6 +1500,9 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 mov x9, #128 neg x7, x9 + ld1 {v0.8h,v1.8h}, [x10], #32 + ld1 {v8.8h,v9.8h}, [x10] + cmp w3, #34 b.le idct32x32_quarter_add_neon cmp w3, #135 @@ -1565,8 +1545,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 ldp d8, d9, [sp], 0x10 ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 br x15 endfunc @@ -1592,8 +1570,6 @@ function idct32x32_\size\()_add_neon ldp d8, d9, [sp], 0x10 ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 br x15 endfunc

[FFmpeg-devel,30/34] aarch64: vp9itxfm: Avoid reloading the idct32 coefficients

Commit Message

Patch