From patchwork Sun Mar 12 22:06:26 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ilia X-Patchwork-Id: 2905 Delivered-To: ffmpegpatchwork@gmail.com Received: by 10.103.50.79 with SMTP id y76csp923075vsy; Sun, 12 Mar 2017 14:12:15 -0700 (PDT) X-Received: by 10.28.133.203 with SMTP id h194mr7333706wmd.122.1489353135886; Sun, 12 Mar 2017 14:12:15 -0700 (PDT) Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id r10si8263890wmg.86.2017.03.12.14.12.15; Sun, 12 Mar 2017 14:12:15 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; dkim=neutral (body hash did not verify) header.i=@gmail.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org; dmarc=fail (p=NONE sp=NONE dis=NONE) header.from=gmail.com Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id D1E97689CD1; Sun, 12 Mar 2017 23:11:57 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-lf0-f66.google.com (mail-lf0-f66.google.com [209.85.215.66]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 09F9A689BE4 for ; Sun, 12 Mar 2017 23:11:52 +0200 (EET) Received: by mail-lf0-f66.google.com with SMTP id y193so10537953lfd.1 for ; Sun, 12 Mar 2017 14:12:07 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20161025; h=from:to:cc:subject:date:message-id; bh=+wllTBl2WdxUEAoChLJHpur5OhurPLzLpWCfLUJC4N0=; b=C6aRLQLyFoDs4+hPnUCG7JR+tl4Cg3MnmdB/EvKD/0EtwLXWByvyFysxpBzm+51i8B bkdVuJbZpUYU//3K/7BXCpueIX5eYaCJKACFlPUSMGZqsk0VazYwSDoI1CCKQ8xvxNdt d+RZVzd7Dn6O+l5RX3ZCx9k9lVA/JOY6N3XF9jlJKQd62zxe+OTRRIv6T7Bd5rJrYgW7 qBMzevLEBFuaX3cyftkhW7w17KjyjG7ulC5fbFPy+NauRXAdgGMLUgreO0UgM0nuIEqZ 6lBz3qz865t6Dzj1rffwoagcACYY/FZYxb3j0FLrzw8IzIYZp8taa4er7W7oBfFBlHm8 4inQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id; bh=+wllTBl2WdxUEAoChLJHpur5OhurPLzLpWCfLUJC4N0=; b=Vr/97jCodves75J81v6zC2+m6vcE6ymEEdWrGVUlf+DzqxuIKsxjjUff7wKAJpdlMS uSbrBAqCgwHYfnFSt/smkaX7eeTQWqCEBq5G0Qwcn93Gr16NVPUeAa3uY7CTTqCC+Y3x yzRJZt6EkVLZerVcfM2jG2tYEqL6E5PbU0wQhAJYQRmLOUFkHcIMG+cZFrjrkm76ZVHM QxlStZdm3Ad5FoWT7NcAO2CNzYiUUX7LpMCGIgLbOKx9Kh7B4ibcgX+eieAiFp38m/PU LigZQsCC95eWfBS1Q4MgZ5VF+0Q4tlZZ9kxxuRQkzCzUShRUmFrfyKrCqIVVxSffXpgo bmww== X-Gm-Message-State: AMke39kjhWnGDwO88PWbXyFhdRI5DCDVns703i+ukW8SSX0nw3nHh3H35dOgfdcdjjdLRw== X-Received: by 10.25.17.98 with SMTP id g95mr8109171lfi.69.1489352814507; Sun, 12 Mar 2017 14:06:54 -0700 (PDT) Received: from localhost.localdomain ([95.191.222.231]) by smtp.gmail.com with ESMTPSA id h140sm3211129lfg.39.2017.03.12.14.06.53 (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Sun, 12 Mar 2017 14:06:53 -0700 (PDT) From: Ilia To: ffmpeg-devel@ffmpeg.org Date: Mon, 13 Mar 2017 05:06:26 +0700 Message-Id: <20170312220626.7164-1-zakne0ne@gmail.com> X-Mailer: git-send-email 2.8.3 Subject: [FFmpeg-devel] [PATCH] avcodec/vp9: avx2 implementation of ipred_dl_16x16_16 X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Ilia MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" vp9_diag_downleft_16x16_10bpp_c: 263.0 vp9_diag_downleft_16x16_10bpp_sse2: 44.7 vp9_diag_downleft_16x16_10bpp_ssse3: 32.5 vp9_diag_downleft_16x16_10bpp_avx: 31.9 vp9_diag_downleft_16x16_10bpp_avx2: 25.7 vp9_diag_downleft_16x16_12bpp_c: 264.7 vp9_diag_downleft_16x16_12bpp_sse2: 44.4 vp9_diag_downleft_16x16_12bpp_ssse3: 32.0 vp9_diag_downleft_16x16_12bpp_avx: 32.4 vp9_diag_downleft_16x16_12bpp_avx2: 25.5 Benchmarked with 10000 runs Signed-off-by: Ilia --- libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++ libavcodec/x86/vp9intrapred_16bpp.asm | 39 +++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index eb67499..4576ff1 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -51,6 +51,7 @@ decl_ipred_fns(h, 16, mmxext, sse2); decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); +decl_ipred_fn(dl, 16, 16, avx2); #define decl_ipred_dir_funcs(type) \ decl_ipred_fns(type, 16, sse2, sse2); \ @@ -133,6 +134,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(2, 1, 32, avg, _16, avx2); init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); + init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); } #endif /* HAVE_YASM */ diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index c0ac16d..212e413 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -847,6 +847,45 @@ DL_FUNCS INIT_XMM avx DL_FUNCS +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefghijklmnop + vpbroadcastw xm1, [aq+30] ; pppppppp + vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp + vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp + vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp + LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp + vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp + DEFINE_ARGS dst, stride, stride3, cnt + mov cntd, 2 + lea stride3q, [strideq*3] +.loop: + mova [dstq+strideq*0], m0 + vpalignr m3, m2, m0, 2 + vpalignr m4, m2, m0, 4 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 6 + vpalignr m4, m2, m0, 8 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m4 + vpalignr m3, m2, m0, 10 + vpalignr m4, m2, m0, 12 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 14 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + mova m0, m2 + vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp + dec cntd + jg .loop + RET +%endif + %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a movh m0, [lq] ; wxyz....