From patchwork Mon Jan 9 22:15:16 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 2153 Delivered-To: ffmpegpatchwork@gmail.com Received: by 10.103.89.21 with SMTP id n21csp7257233vsb; Mon, 9 Jan 2017 14:20:47 -0800 (PST) X-Received: by 10.28.66.194 with SMTP id k63mr6185228wmi.140.1484000447205; Mon, 09 Jan 2017 14:20:47 -0800 (PST) Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id t10si198950wmb.0.2017.01.09.14.20.46; Mon, 09 Jan 2017 14:20:47 -0800 (PST) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; dkim=neutral (body hash did not verify) header.i=@martin-st.20150623.gappssmtp.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id BEC5C68A220; Tue, 10 Jan 2017 00:20:36 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-lf0-f66.google.com (mail-lf0-f66.google.com [209.85.215.66]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 948F768077C for ; Tue, 10 Jan 2017 00:20:30 +0200 (EET) Received: by mail-lf0-f66.google.com with SMTP id j75so13009079lfe.3 for ; Mon, 09 Jan 2017 14:20:37 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=martin-st.20150623.gappssmtp.com; s=20150623; h=from:to:subject:date:message-id:in-reply-to:references; bh=gKFD6dsuMEtJhled2UDTJWLRzqabZDczeMes2T6uAZA=; b=E4IAb5cESvN6YURJ5KLT9m3GxSLatRlrfyTlGXl40gEty97SgCYnYuKIKrsZeb4jqZ qe3IjcEaaqkrEcaUhyj0UybpfsgIayvMyV7Axg1SV8NwW3mNwTatKfU6SFkvd2WDpIy5 
x8jMX1birB0xahewleLeVtCFZJi6ha/Yf7HaYbghmHnxGI5FZ6h5Y7Pqnh4lnSIoJ2qP mG3osYaoAWaKGVpNmSTkW72yU76FChNJBOYhHJL39TNV6zUkT5SCxQW8VC85emG0kNQp M4zITi/oV4EXMMPdBXR8Dgx9kMNFhxbdZ7CmWIbp181Wyc0aGOlrVDRDSd8f+Wc/cOiq koZA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references; bh=gKFD6dsuMEtJhled2UDTJWLRzqabZDczeMes2T6uAZA=; b=qgmMPwWjVm1Tu4E+snduF9nOmfxc3Y3ga92oRCJ9kuB9MjrbBPz3kaVwybyipnpg8b TClVymXD1iUP3NoTe/rCe5Hsel/hhRTW2w5oh6KJWeUeIkFwWmqTyIejyO1sjUFsl4dc WPVk3/hlpoI6IdSlXGN9nOZ4e6EGII2GFVhfxnK6DZ9v0SU63fCjEn/bsAc9IgfqakKf wG9y3Sa3L82V0CBNtFV9msNIaP/PHlvYQanbkRxr/7swPrsYXUAgDWHLa7ypal5XzJMi 57wSfp+rU96DRVm9UykC8iKMqRrToVDMpNOyheQMPvVma6U5fJfPbk8VHPWl8c02z/xo lYNA== X-Gm-Message-State: AIkVDXL3Y+M+ccr6pbbZ218XG4tSO9wyGexMt4/v/ipkb0bFdUTKAzvJPguexndKv9EmVA== X-Received: by 10.25.56.22 with SMTP id f22mr28178434lfa.0.1484000129060; Mon, 09 Jan 2017 14:15:29 -0800 (PST) Received: from localhost.localdomain ([2001:470:28:852:a9ed:5432:636c:1053]) by smtp.gmail.com with ESMTPSA id f25sm1358538lji.26.2017.01.09.14.15.28 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128); Mon, 09 Jan 2017 14:15:28 -0800 (PST) From: =?UTF-8?q?Martin=20Storsj=C3=B6?= To: ffmpeg-devel@ffmpeg.org Date: Tue, 10 Jan 2017 00:15:16 +0200 Message-Id: <1484000119-4959-10-git-send-email-martin@martin.st> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1484000119-4959-1-git-send-email-martin@martin.st> References: <1484000119-4959-1-git-send-email-martin@martin.st> Subject: [FFmpeg-devel] [PATCH 10/13] aarch64: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32 X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches MIME-Version: 1.0 Errors-To: 
ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel"

This work is sponsored by, and copyright, Google.

Previously all subpartitions except the eob=1 (DC) case ran with the same runtime:

vp9_inv_dct_dct_16x16_sub16_add_neon: 1373.2
vp9_inv_dct_dct_32x32_sub32_add_neon: 8089.0

By skipping individual 8x16 or 8x32 pixel slices in the first pass, we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3
vp9_inv_dct_dct_16x16_sub2_add_neon: 1036.7
vp9_inv_dct_dct_16x16_sub4_add_neon: 1036.7
vp9_inv_dct_dct_16x16_sub8_add_neon: 1036.7
vp9_inv_dct_dct_16x16_sub12_add_neon: 1372.1
vp9_inv_dct_dct_16x16_sub16_add_neon: 1372.1
vp9_inv_dct_dct_32x32_sub1_add_neon: 555.1
vp9_inv_dct_dct_32x32_sub2_add_neon: 5190.2
vp9_inv_dct_dct_32x32_sub4_add_neon: 5180.0
vp9_inv_dct_dct_32x32_sub8_add_neon: 5183.1
vp9_inv_dct_dct_32x32_sub12_add_neon: 6161.5
vp9_inv_dct_dct_32x32_sub16_add_neon: 6155.5
vp9_inv_dct_dct_32x32_sub20_add_neon: 7136.3
vp9_inv_dct_dct_32x32_sub24_add_neon: 7128.4
vp9_inv_dct_dct_32x32_sub28_add_neon: 8098.9
vp9_inv_dct_dct_32x32_sub32_add_neon: 8098.8

I.e. in general a very minor overhead for the full subpartition case due to the additional cmps, but a significant speedup for the cases when we only need to process a small part of the actual input data.

This is cherry-picked from libav commits cad42fadcd2c2ae1b3676bb398844a1f521a2d7b and a0c443a3980dc22eb02b067ac4cb9ffa2f9b04d2.
--- libavcodec/aarch64/vp9itxfm_neon.S | 61 ++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index e5fc612..82f1f41 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -588,6 +588,9 @@ endfunc .macro store i, dst, inc st1 {v\i\().8h}, [\dst], \inc .endm +.macro movi_v i, size, imm + movi v\i\()\size, \imm +.endm .macro load_clear i, src, inc ld1 {v\i\().8h}, [\src] st1 {v2.8h}, [\src], \inc @@ -596,9 +599,8 @@ endfunc // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, // transpose into a horizontal 16x8 slice and store. // x0 = dst (temp buffer) -// x1 = unused +// x1 = slice offset // x2 = src -// x3 = slice offset // x9 = input stride .macro itxfm16_1d_funcs txfm function \txfm\()16_1d_8x16_pass1_neon @@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 // Store the transposed 8x8 blocks horizontally. - cmp x3, #8 + cmp x1, #8 b.eq 1f .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 store \i, x0, #16 .endr ret 1: - // Special case: For the last input column (x3 == 8), + // Special case: For the last input column (x1 == 8), // which would be stored as the last row in the temp buffer, // don't store the first 8x8 block, but keep it in registers // for the first slice of the second pass (where it is the @@ -751,13 +753,36 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .irp i, 0, 8 add x0, sp, #(\i*32) +.ifc \txfm1\()_\txfm2,idct_idct +.if \i == 8 + cmp w3, #38 + b.le 1f +.endif +.endif + mov x1, #\i add x2, x6, #(\i*2) - mov x3, #\i bl \txfm1\()16_1d_8x16_pass1_neon .endr .ifc \txfm1\()_\txfm2,iadst_idct ld1 {v0.8h,v1.8h}, [x10] .endif + +.ifc \txfm1\()_\txfm2,idct_idct + b 3f +1: + // Set v24-v31 to zero, for the in-register passthrough of + // coefficients to pass 2. 
Since we only do two slices, this can + // only ever happen for the second slice. So we only need to store + // zeros to the temp buffer for the second half of the buffer. + // Move x0 to the second half, and use x9 == 32 as increment. + add x0, x0, #16 +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + movi_v \i, .16b, #0 + st1 {v24.8h}, [x0], x9 +.endr +3: +.endif + .irp i, 0, 8 add x0, x4, #(\i) mov x1, x5 @@ -1073,12 +1098,17 @@ function idct32_1d_8x32_pass2_neon ret endfunc +const min_eob_idct_idct_32, align=4 + .short 0, 34, 135, 336 +endconst + function ff_vp9_idct_idct_32x32_add_neon, export=1 cmp w3, #1 b.eq idct32x32_dc_add_neon movrel x10, idct_coeffs add x11, x10, #32 + movrel x12, min_eob_idct_idct_32, 2 mov x15, x30 @@ -1099,9 +1129,30 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 .irp i, 0, 8, 16, 24 add x0, sp, #(\i*64) +.if \i > 0 + ldrh w1, [x12], #2 + cmp w3, w1 + mov x1, #(32 - \i)/4 + b.le 1f +.endif add x2, x6, #(\i*2) bl idct32_1d_8x32_pass1_neon .endr + b 3f + +1: + // Write zeros to the temp buffer for pass 2 + movi v16.8h, #0 + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 +2: + subs x1, x1, #1 +.rept 4 + st1 {v16.8h-v19.8h}, [x0], #64 +.endr + b.ne 2b +3: .irp i, 0, 8, 16, 24 add x0, x4, #(\i) mov x1, x5