Message ID | 20170604175227.3296-1-zakne0ne@gmail.com |
---|---|
State | New |
Headers | show |
Hi, On Sun, Jun 4, 2017 at 1:52 PM, Ilia Valiakhmetov <zakne0ne@gmail.com> wrote: > vp9_diag_downleft_32x32_8bpp_c: 580.2 > vp9_diag_downleft_32x32_8bpp_sse2: 75.6 > vp9_diag_downleft_32x32_8bpp_ssse3: 73.7 > vp9_diag_downleft_32x32_8bpp_avx: 72.7 > vp9_diag_downleft_32x32_10bpp_c: 1101.2 > vp9_diag_downleft_32x32_10bpp_sse2: 145.4 > vp9_diag_downleft_32x32_10bpp_ssse3: 137.5 > vp9_diag_downleft_32x32_10bpp_avx: 134.8 > vp9_diag_downleft_32x32_10bpp_avx2: 94.0 > vp9_diag_downleft_32x32_12bpp_c: 1108.5 > vp9_diag_downleft_32x32_12bpp_sse2: 145.5 > vp9_diag_downleft_32x32_12bpp_ssse3: 137.3 > vp9_diag_downleft_32x32_12bpp_avx: 135.2 > vp9_diag_downleft_32x32_12bpp_avx2: 94.0 > > ~30% faster than avx implementation > > --- > libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++ > libavcodec/x86/vp9intrapred_16bpp.asm | 63 ++++++++++++++++++++++++++++++ > +++++ > 2 files changed, 65 insertions(+) LGTM. I'll keep it open for comments for another few hours before I push. Ronald
On 6/4/2017 2:52 PM, Ilia Valiakhmetov wrote: > vp9_diag_downleft_32x32_8bpp_c: 580.2 > vp9_diag_downleft_32x32_8bpp_sse2: 75.6 > vp9_diag_downleft_32x32_8bpp_ssse3: 73.7 > vp9_diag_downleft_32x32_8bpp_avx: 72.7 > vp9_diag_downleft_32x32_10bpp_c: 1101.2 > vp9_diag_downleft_32x32_10bpp_sse2: 145.4 > vp9_diag_downleft_32x32_10bpp_ssse3: 137.5 > vp9_diag_downleft_32x32_10bpp_avx: 134.8 > vp9_diag_downleft_32x32_10bpp_avx2: 94.0 > vp9_diag_downleft_32x32_12bpp_c: 1108.5 > vp9_diag_downleft_32x32_12bpp_sse2: 145.5 > vp9_diag_downleft_32x32_12bpp_ssse3: 137.3 > vp9_diag_downleft_32x32_12bpp_avx: 135.2 > vp9_diag_downleft_32x32_12bpp_avx2: 94.0 > > ~30% faster than avx implementation Nice. > > --- > libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++ > libavcodec/x86/vp9intrapred_16bpp.asm | 63 +++++++++++++++++++++++++++++++++++ > 2 files changed, 65 insertions(+) > > diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c > index 4576ff1..d1b8fcd 100644 > --- a/libavcodec/x86/vp9dsp_init_16bpp.c > +++ b/libavcodec/x86/vp9dsp_init_16bpp.c > @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); > decl_ipred_fns(dc_top, 16, mmxext, sse2); > decl_ipred_fns(dc_left, 16, mmxext, sse2); > decl_ipred_fn(dl, 16, 16, avx2); > +decl_ipred_fn(dl, 32, 16, avx2); > > #define decl_ipred_dir_funcs(type) \ > decl_ipred_fns(type, 16, sse2, sse2); \ > @@ -135,6 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) > init_fpel_func(1, 1, 64, avg, _16, avx2); > init_fpel_func(0, 1, 128, avg, _16, avx2); > init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); > + init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); > } > > #endif /* HAVE_YASM */ > diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm > index 212e413..5cd6a3e 100644 > --- a/libavcodec/x86/vp9intrapred_16bpp.asm > +++ b/libavcodec/x86/vp9intrapred_16bpp.asm > @@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a > DEFINE_ARGS dst, 
stride, stride3, cnt > mov cntd, 2 > lea stride3q, [strideq*3] > + Trailing whitespace. > .loop: > mova [dstq+strideq*0], m0 > vpalignr m3, m2, m0, 2 > @@ -884,6 +885,68 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a > dec cntd > jg .loop > RET > + Same. > +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a > + movifnidn aq, amp > + mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop > + mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345 > + vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555 > + vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx > + vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq > + vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr > + LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ > + vperm2i128 m5, m1, m4, q0201 ; yz01234555555555 > + vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455 > + vpalignr m3, m5, m1, 4 ; stuvwxyz01234555 > + LOWPASS 1, 2, 3 ; RSTUVWXYZ......5 > + vperm2i128 m2, m1, m4, q0201 ; Z......555555555 > + vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY > + DEFINE_ARGS dst, stride, stride3, cnt > + lea stride3q, [strideq*3] > + mov cntd, 4 > + Same. Ronald can fix them before pushing (I think the git hooks would prevent him from pushing this with them anyway), so no need to resend a fixed patch. Just keep it in mind for future patchsets. The same applies to tabs in files other than Makefiles.
> +.loop: > + mova [dstq+strideq*0 + 0], m0 > + mova [dstq+strideq*0 +32], m1 > + vpalignr m3, m5, m0, 2 > + vpalignr m4, m2, m1, 2 > + mova [dstq+strideq*1 + 0], m3 > + mova [dstq+strideq*1 +32], m4 > + vpalignr m3, m5, m0, 4 > + vpalignr m4, m2, m1, 4 > + mova [dstq+strideq*2 + 0], m3 > + mova [dstq+strideq*2 +32], m4 > + vpalignr m3, m5, m0, 6 > + vpalignr m4, m2, m1, 6 > + mova [dstq+stride3q*1+ 0], m3 > + mova [dstq+stride3q*1+32], m4 > + lea dstq, [dstq+strideq*4] > + vpalignr m3, m5, m0, 8 > + vpalignr m4, m2, m1, 8 > + mova [dstq+strideq*0 + 0], m3 > + mova [dstq+strideq*0 +32], m4 > + vpalignr m3, m5, m0, 10 > + vpalignr m4, m2, m1, 10 > + mova [dstq+strideq*1 + 0], m3 > + mova [dstq+strideq*1 +32], m4 > + vpalignr m3, m5, m0, 12 > + vpalignr m4, m2, m1, 12 > + mova [dstq+strideq*2+ 0], m3 > + mova [dstq+strideq*2+32], m4 > + vpalignr m3, m5, m0, 14 > + vpalignr m4, m2, m1, 14 > + mova [dstq+stride3q+ 0], m3 > + mova [dstq+stride3q+ 32], m4 > + vpalignr m3, m5, m0, 16 > + vpalignr m4, m2, m1, 16 > + vperm2i128 m5, m3, m4, q0201 > + vperm2i128 m2, m4, m4, q0101 > + mova m0, m3 > + mova m1, m4 > + lea dstq, [dstq+strideq*4] > + dec cntd > + jg .loop > + RET > %endif > > %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function >
Hi, On Mon, Jun 5, 2017 at 1:41 PM, James Almer <jamrial@gmail.com> wrote: > On 6/4/2017 2:52 PM, Ilia Valiakhmetov wrote: > > vp9_diag_downleft_32x32_8bpp_c: 580.2 > > vp9_diag_downleft_32x32_8bpp_sse2: 75.6 > > vp9_diag_downleft_32x32_8bpp_ssse3: 73.7 > > vp9_diag_downleft_32x32_8bpp_avx: 72.7 > > vp9_diag_downleft_32x32_10bpp_c: 1101.2 > > vp9_diag_downleft_32x32_10bpp_sse2: 145.4 > > vp9_diag_downleft_32x32_10bpp_ssse3: 137.5 > > vp9_diag_downleft_32x32_10bpp_avx: 134.8 > > vp9_diag_downleft_32x32_10bpp_avx2: 94.0 > > vp9_diag_downleft_32x32_12bpp_c: 1108.5 > > vp9_diag_downleft_32x32_12bpp_sse2: 145.5 > > vp9_diag_downleft_32x32_12bpp_ssse3: 137.3 > > vp9_diag_downleft_32x32_12bpp_avx: 135.2 > > vp9_diag_downleft_32x32_12bpp_avx2: 94.0 > > > > ~30% faster than avx implementation > > Nice. > > > > > --- > > libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++ > > libavcodec/x86/vp9intrapred_16bpp.asm | 63 > +++++++++++++++++++++++++++++++++++ > > 2 files changed, 65 insertions(+) > > > > diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c > b/libavcodec/x86/vp9dsp_init_16bpp.c > > index 4576ff1..d1b8fcd 100644 > > --- a/libavcodec/x86/vp9dsp_init_16bpp.c > > +++ b/libavcodec/x86/vp9dsp_init_16bpp.c > > @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); > > decl_ipred_fns(dc_top, 16, mmxext, sse2); > > decl_ipred_fns(dc_left, 16, mmxext, sse2); > > decl_ipred_fn(dl, 16, 16, avx2); > > +decl_ipred_fn(dl, 32, 16, avx2); > > > > #define decl_ipred_dir_funcs(type) \ > > decl_ipred_fns(type, 16, sse2, sse2); \ > > @@ -135,6 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext > *dsp) > > init_fpel_func(1, 1, 64, avg, _16, avx2); > > init_fpel_func(0, 1, 128, avg, _16, avx2); > > init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); > > + init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); > > } > > > > #endif /* HAVE_YASM */ > > diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm > b/libavcodec/x86/vp9intrapred_16bpp.asm > > index 212e413..5cd6a3e 100644 > > --- 
a/libavcodec/x86/vp9intrapred_16bpp.asm > > +++ b/libavcodec/x86/vp9intrapred_16bpp.asm > > @@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, > l, a > > DEFINE_ARGS dst, stride, stride3, cnt > > mov cntd, 2 > > lea stride3q, [strideq*3] > > + > > Trailing whitespaces. > > > .loop: > > mova [dstq+strideq*0], m0 > > vpalignr m3, m2, m0, 2 > > @@ -884,6 +885,68 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, > stride, l, a > > dec cntd > > jg .loop > > RET > > + > > Same. > > > +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a > > + movifnidn aq, amp > > + mova m0, [aq+mmsize*0+ 0] ; > abcdefghijklmnop > > + mova m1, [aq+mmsize*1+ 0] ; > qrstuvwxyz012345 > > + vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555 > > + vperm2i128 m5, m0, m1, q0201 ; > ijklmnopqrstuvwx > > + vpalignr m2, m5, m0, 2 ; > bcdefghijklmnopq > > + vpalignr m3, m5, m0, 4 ; > cdefghijklmnopqr > > + LOWPASS 0, 2, 3 ; > BCDEFGHIJKLMNOPQ > > + vperm2i128 m5, m1, m4, q0201 ; > yz01234555555555 > > + vpalignr m2, m5, m1, 2 ; > rstuvwxyz0123455 > > + vpalignr m3, m5, m1, 4 ; > stuvwxyz01234555 > > + LOWPASS 1, 2, 3 ; > RSTUVWXYZ......5 > > + vperm2i128 m2, m1, m4, q0201 ; > Z......555555555 > > + vperm2i128 m5, m0, m1, q0201 ; > JKLMNOPQRSTUVWXY > > + DEFINE_ARGS dst, stride, stride3, cnt > > + lea stride3q, [strideq*3] > > + mov cntd, 4 > > + > > Same. > > Ronald can fix them before pushing (I think the git hooks would prevent > him to push this with them anyway), so no need to resend a fixed patch. > Just keep it in mind for future patchsets. Same with tabs on files other > than Makefile stuff. Pushed with that fixed. Ronald
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index 4576ff1..d1b8fcd 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); decl_ipred_fn(dl, 16, 16, avx2); +decl_ipred_fn(dl, 32, 16, avx2); #define decl_ipred_dir_funcs(type) \ decl_ipred_fns(type, 16, sse2, sse2); \ @@ -135,6 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); + init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); } #endif /* HAVE_YASM */ diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 212e413..5cd6a3e 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a DEFINE_ARGS dst, stride, stride3, cnt mov cntd, 2 lea stride3q, [strideq*3] + .loop: mova [dstq+strideq*0], m0 vpalignr m3, m2, m0, 2 @@ -884,6 +885,68 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a dec cntd jg .loop RET + +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop + mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345 + vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555 + vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx + vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq + vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr + LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ + vperm2i128 m5, m1, m4, q0201 ; yz01234555555555 + vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455 + vpalignr m3, m5, m1, 4 ; stuvwxyz01234555 + LOWPASS 1, 2, 3 ; RSTUVWXYZ......5 + vperm2i128 m2, m1, m4, q0201 ; Z......555555555 + vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY + DEFINE_ARGS dst, stride, 
stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + +.loop: + mova [dstq+strideq*0 + 0], m0 + mova [dstq+strideq*0 +32], m1 + vpalignr m3, m5, m0, 2 + vpalignr m4, m2, m1, 2 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 4 + vpalignr m4, m2, m1, 4 + mova [dstq+strideq*2 + 0], m3 + mova [dstq+strideq*2 +32], m4 + vpalignr m3, m5, m0, 6 + vpalignr m4, m2, m1, 6 + mova [dstq+stride3q*1+ 0], m3 + mova [dstq+stride3q*1+32], m4 + lea dstq, [dstq+strideq*4] + vpalignr m3, m5, m0, 8 + vpalignr m4, m2, m1, 8 + mova [dstq+strideq*0 + 0], m3 + mova [dstq+strideq*0 +32], m4 + vpalignr m3, m5, m0, 10 + vpalignr m4, m2, m1, 10 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 12 + vpalignr m4, m2, m1, 12 + mova [dstq+strideq*2+ 0], m3 + mova [dstq+strideq*2+32], m4 + vpalignr m3, m5, m0, 14 + vpalignr m4, m2, m1, 14 + mova [dstq+stride3q+ 0], m3 + mova [dstq+stride3q+ 32], m4 + vpalignr m3, m5, m0, 16 + vpalignr m4, m2, m1, 16 + vperm2i128 m5, m3, m4, q0201 + vperm2i128 m2, m4, m4, q0101 + mova m0, m3 + mova m1, m4 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET %endif %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function