From patchwork Thu Mar 16 22:10:06 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 2961 Delivered-To: ffmpegpatchwork@gmail.com Received: by 10.103.50.79 with SMTP id y76csp1186vsy; Thu, 16 Mar 2017 15:10:32 -0700 (PDT) X-Received: by 10.223.138.250 with SMTP id z55mr10928701wrz.130.1489702232249; Thu, 16 Mar 2017 15:10:32 -0700 (PDT) Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id r59si8362490wrb.259.2017.03.16.15.10.31; Thu, 16 Mar 2017 15:10:32 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; dkim=neutral (body hash did not verify) header.i=@martin-st.20150623.gappssmtp.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 03A8D688290; Fri, 17 Mar 2017 00:10:12 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-lf0-f67.google.com (mail-lf0-f67.google.com [209.85.215.67]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 4DEFB688261 for ; Fri, 17 Mar 2017 00:10:05 +0200 (EET) Received: by mail-lf0-f67.google.com with SMTP id g70so4296051lfh.3 for ; Thu, 16 Mar 2017 15:10:22 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=martin-st.20150623.gappssmtp.com; s=20150623; h=from:to:subject:date:message-id; bh=BwylnCAa9F0rQtXqFezPrQgZdZg0uLYdGwAoVkssUtU=; b=UA/LVSo3lRXb5C5bbhtTJJAGlsJCMduqWD9G6vO3RSwZmCGCrG9IJSIya60oFzzTE/ vazKjhWsxqTqPioihThHalWhkkAUTuhm8rjdL/8P6ySX+fLXlRzyUtVsunMLRRXw3OjE 
okMYE7EPJ+5qBkk7kq1ft0FIrISAZq2LTYC7LaibNsk+fYsrzhM91ADwvYFSwFb/8qIn DdMGzem9WrkO6O6pf3wHIwjY9ZaxkCmkn5pOyUhYE369P3nj6LFW9vHG7JAUjazvmWqZ B8Dvbm+irg2ffhblyWnbSSarDtsC/3InkrP2lCj09HF/MuZN3wTDzStrQXUFYjhzDlQ3 a2ZA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id; bh=BwylnCAa9F0rQtXqFezPrQgZdZg0uLYdGwAoVkssUtU=; b=iJ9ymMp0gHA/Y+4CaIr9rUIF2U35bdQLmjbHyhbUdDeJbnnMYYyGL1i27lG490+jjV LQr/plb7/mOwdaU0H1cdDKBLLVh/ggXEEawN/qHOwkV4Vgu8Ey+M4+jv3zKT5isolIy3 i72HjbT4iFMv0HKw//w7x1mnI79qCOXqh8iKTWFz+xCQ2X/LqRHys/TvrG6eE7NkSQwv qaVgjxiu3gjG6wexkinTo9yGTWOVR0KbzEKjBDfEBu6gmKS6Bdte2Lxb1n3ptYZKJ7P8 ZvQmTqGyYYpmIMReADZ1BWX0iWWWrwKBWt5Fg5n9/Lf7F7/lfxMAr9wmeHlGOvrXJMGO 3HRA== X-Gm-Message-State: AFeK/H3UKJh160BJZD6J46cJMzHxzivxT29gUlimETFfwFo/gxGtvYV+OCfT0cuQiJqiDA== X-Received: by 10.25.161.81 with SMTP id k78mr2617974lfe.111.1489702222053; Thu, 16 Mar 2017 15:10:22 -0700 (PDT) Received: from localhost.localdomain ([2001:470:28:852:10ad:e858:1f3b:5c2c]) by smtp.gmail.com with ESMTPSA id g3sm1124718lfe.34.2017.03.16.15.10.21 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128); Thu, 16 Mar 2017 15:10:21 -0700 (PDT) From: =?UTF-8?q?Martin=20Storsj=C3=B6?= To: ffmpeg-devel@ffmpeg.org Date: Fri, 17 Mar 2017 00:10:06 +0200 Message-Id: <1489702219-12643-1-git-send-email-martin@martin.st> X-Mailer: git-send-email 2.7.4 Subject: [FFmpeg-devel] [PATCH 01/14] arm: vp9itxfm: Template the quarter/half idct32 function X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" This reduces the number of lines and reduces the duplication. Also simplify the eob check for the half case. 
In the half case, we know that we need to do at least the first three slices; we only need to check eob for the fourth one, so we can hardcode the value to check against instead of loading it from the min_eob array. Since at most one slice can be skipped in the first pass, we can unroll the loop for filling zeros completely, as was done for the quarter case before. This allows skipping loading the min_eob pointer when using the quarter/half cases. This is cherry-picked from libav commit 98ee855ae0cc118bd1d20921d6bdb14731832462. --- libavcodec/arm/vp9itxfm_neon.S | 57 +++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index ebbbda9..adc9896 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -1575,7 +1575,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 beq idct32x32_dc_add_neon push {r4-r8,lr} vpush {q4-q6} - movrel r8, min_eob_idct_idct_32 + 2 @ Align the stack, allocate a temp buffer T mov r7, sp @@ -1597,6 +1596,8 @@ A and r7, sp, #15 cmp r3, #135 ble idct32x32_half_add_neon + movrel r8, min_eob_idct_idct_32 + 2 + .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r0, sp, #(\i*64) .if \i > 0 @@ -1634,72 +1635,54 @@ A and r7, sp, #15 pop {r4-r8,pc} endfunc -function idct32x32_quarter_add_neon +.macro idct32_partial size +function idct32x32_\size\()_add_neon .irp i, 0, 4 add r0, sp, #(\i*64) +.ifc \size,quarter .if \i == 4 cmp r3, #9 ble 1f .endif +.endif add r2, r6, #(\i*2) - bl idct32_1d_4x32_pass1_quarter_neon -.endr - b 3f - -1: - @ Write zeros to the temp buffer for pass 2 - vmov.i16 q14, #0 - vmov.i16 q15, #0 -.rept 8 - vst1.16 {q14-q15}, [r0,:128]! 
-.endr -3: -.irp i, 0, 4, 8, 12, 16, 20, 24, 28 - add r0, r4, #(\i) - mov r1, r5 - add r2, sp, #(\i*2) - bl idct32_1d_4x32_pass2_quarter_neon + bl idct32_1d_4x32_pass1_\size\()_neon .endr - add sp, sp, r7 - vpop {q4-q6} - pop {r4-r8,pc} -endfunc - -function idct32x32_half_add_neon -.irp i, 0, 4, 8, 12 +.ifc \size,half +.irp i, 8, 12 add r0, sp, #(\i*64) -.if \i > 0 - ldrh_post r1, r8, #2 - cmp r3, r1 - it le - movle r1, #(16 - \i)/2 +.if \i == 12 + cmp r3, #70 ble 1f .endif add r2, r6, #(\i*2) - bl idct32_1d_4x32_pass1_half_neon + bl idct32_1d_4x32_pass1_\size\()_neon .endr +.endif b 3f 1: @ Write zeros to the temp buffer for pass 2 vmov.i16 q14, #0 vmov.i16 q15, #0 -2: - subs r1, r1, #1 -.rept 4 +.rept 8 vst1.16 {q14-q15}, [r0,:128]! .endr - bne 2b + 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r0, r4, #(\i) mov r1, r5 add r2, sp, #(\i*2) - bl idct32_1d_4x32_pass2_half_neon + bl idct32_1d_4x32_pass2_\size\()_neon .endr add sp, sp, r7 vpop {q4-q6} pop {r4-r8,pc} endfunc +.endm + +idct32_partial quarter +idct32_partial half