Message ID | 20170312220626.7164-1-zakne0ne@gmail.com |
---|---|
State | Accepted |
Commit | 2f3d10a01ac5f613f80db8542bf3ecda1dd40d79 |
Headers | show |
Hi, On Sun, Mar 12, 2017 at 6:06 PM, Ilia <zakne0ne@gmail.com> wrote: > vp9_diag_downleft_16x16_10bpp_c: 263.0 > vp9_diag_downleft_16x16_10bpp_sse2: 44.7 > vp9_diag_downleft_16x16_10bpp_ssse3: 32.5 > vp9_diag_downleft_16x16_10bpp_avx: 31.9 > vp9_diag_downleft_16x16_10bpp_avx2: 25.7 > vp9_diag_downleft_16x16_12bpp_c: 264.7 > vp9_diag_downleft_16x16_12bpp_sse2: 44.4 > vp9_diag_downleft_16x16_12bpp_ssse3: 32.0 > vp9_diag_downleft_16x16_12bpp_avx: 32.4 > vp9_diag_downleft_16x16_12bpp_avx2: 25.5 > > Benchmarked with 10000 runs > > Signed-off-by: Ilia <zakne0ne@gmail.com> > --- > libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++ > libavcodec/x86/vp9intrapred_16bpp.asm | 39 ++++++++++++++++++++++++++++++ > +++++ > 2 files changed, 41 insertions(+) > > diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c > b/libavcodec/x86/vp9dsp_init_16bpp.c > index eb67499..4576ff1 100644 > --- a/libavcodec/x86/vp9dsp_init_16bpp.c > +++ b/libavcodec/x86/vp9dsp_init_16bpp.c > @@ -51,6 +51,7 @@ decl_ipred_fns(h, 16, mmxext, sse2); > decl_ipred_fns(dc, 16, mmxext, sse2); > decl_ipred_fns(dc_top, 16, mmxext, sse2); > decl_ipred_fns(dc_left, 16, mmxext, sse2); > +decl_ipred_fn(dl, 16, 16, avx2); > > #define decl_ipred_dir_funcs(type) \ > decl_ipred_fns(type, 16, sse2, sse2); \ > @@ -133,6 +134,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext > *dsp) > init_fpel_func(2, 1, 32, avg, _16, avx2); > init_fpel_func(1, 1, 64, avg, _16, avx2); > init_fpel_func(0, 1, 128, avg, _16, avx2); > + init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); > } > > #endif /* HAVE_YASM */ > diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm > b/libavcodec/x86/vp9intrapred_16bpp.asm > index c0ac16d..212e413 100644 > --- a/libavcodec/x86/vp9intrapred_16bpp.asm > +++ b/libavcodec/x86/vp9intrapred_16bpp.asm > @@ -847,6 +847,45 @@ DL_FUNCS > INIT_XMM avx > DL_FUNCS > > +%if HAVE_AVX2_EXTERNAL > +INIT_YMM avx2 > +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a > + movifnidn aq, amp > + mova m0, [aq] ; abcdefghijklmnop 
> + vpbroadcastw xm1, [aq+30] ; pppppppp > + vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp > + vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp > + vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp > + LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp > + vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp > + DEFINE_ARGS dst, stride, stride3, cnt > + mov cntd, 2 > + lea stride3q, [strideq*3] > +.loop: > + mova [dstq+strideq*0], m0 > + vpalignr m3, m2, m0, 2 > + vpalignr m4, m2, m0, 4 > + mova [dstq+strideq*1], m3 > + mova [dstq+strideq*2], m4 > + vpalignr m3, m2, m0, 6 > + vpalignr m4, m2, m0, 8 > + mova [dstq+stride3q ], m3 > + lea dstq, [dstq+strideq*4] > + mova [dstq+strideq*0], m4 > + vpalignr m3, m2, m0, 10 > + vpalignr m4, m2, m0, 12 > + mova [dstq+strideq*1], m3 > + mova [dstq+strideq*2], m4 > + vpalignr m3, m2, m0, 14 > + mova [dstq+stride3q ], m3 > + lea dstq, [dstq+strideq*4] > + mova m0, m2 > + vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp > + dec cntd > + jg .loop > + RET > +%endif > + > %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function > cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a > movh m0, [lq] ; wxyz.... > -- > 2.8.3 Pushed. Ronald
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index eb67499..4576ff1 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -51,6 +51,7 @@ decl_ipred_fns(h, 16, mmxext, sse2); decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); +decl_ipred_fn(dl, 16, 16, avx2); #define decl_ipred_dir_funcs(type) \ decl_ipred_fns(type, 16, sse2, sse2); \ @@ -133,6 +134,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(2, 1, 32, avg, _16, avx2); init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); + init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); } #endif /* HAVE_YASM */ diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index c0ac16d..212e413 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -847,6 +847,45 @@ DL_FUNCS INIT_XMM avx DL_FUNCS +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefghijklmnop + vpbroadcastw xm1, [aq+30] ; pppppppp + vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp + vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp + vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp + LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp + vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp + DEFINE_ARGS dst, stride, stride3, cnt + mov cntd, 2 + lea stride3q, [strideq*3] +.loop: + mova [dstq+strideq*0], m0 + vpalignr m3, m2, m0, 2 + vpalignr m4, m2, m0, 4 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 6 + vpalignr m4, m2, m0, 8 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m4 + vpalignr m3, m2, m0, 10 + vpalignr m4, m2, m0, 12 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 14 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + 
mova m0, m2 + vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp + dec cntd + jg .loop + RET +%endif + %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a movh m0, [lq] ; wxyz....
vp9_diag_downleft_16x16_10bpp_c: 263.0
vp9_diag_downleft_16x16_10bpp_sse2: 44.7
vp9_diag_downleft_16x16_10bpp_ssse3: 32.5
vp9_diag_downleft_16x16_10bpp_avx: 31.9
vp9_diag_downleft_16x16_10bpp_avx2: 25.7
vp9_diag_downleft_16x16_12bpp_c: 264.7
vp9_diag_downleft_16x16_12bpp_sse2: 44.4
vp9_diag_downleft_16x16_12bpp_ssse3: 32.0
vp9_diag_downleft_16x16_12bpp_avx: 32.4
vp9_diag_downleft_16x16_12bpp_avx2: 25.5

Benchmarked with 10000 runs

Signed-off-by: Ilia <zakne0ne@gmail.com>
---
 libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 39 +++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)