Message ID | 20170608150824.3092-1-zakne0ne@gmail.com |
---|---|
State | Superseded |
Headers | show |
On 6/8/17, Ilia Valiakhmetov <zakne0ne@gmail.com> wrote: > vp9_diag_downright_16x16_12bpp_c: 149.0 > vp9_diag_downright_16x16_12bpp_sse2: 67.8 > vp9_diag_downright_16x16_12bpp_ssse3: 45.6 > vp9_diag_downright_16x16_12bpp_avx: 36.6 > vp9_diag_downright_16x16_12bpp_avx2: 25.5 > > ~30% faster than avx > > Signed-off-by: Ilia Valiakhmetov <zakne0ne@gmail.com> > --- > libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++ > libavcodec/x86/vp9intrapred_16bpp.asm | 56 > +++++++++++++++++++++++++++++++++++ > 2 files changed, 58 insertions(+) > > diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c > b/libavcodec/x86/vp9dsp_init_16bpp.c > index d1b8fcd..8d1aa13 100644 > --- a/libavcodec/x86/vp9dsp_init_16bpp.c > +++ b/libavcodec/x86/vp9dsp_init_16bpp.c > @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); > decl_ipred_fns(dc_top, 16, mmxext, sse2); > decl_ipred_fns(dc_left, 16, mmxext, sse2); > decl_ipred_fn(dl, 16, 16, avx2); > +decl_ipred_fn(dr, 16, 16, avx2); > decl_ipred_fn(dl, 32, 16, avx2); > > #define decl_ipred_dir_funcs(type) \ > @@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext > *dsp) > init_fpel_func(1, 1, 64, avg, _16, avx2); > init_fpel_func(0, 1, 128, avg, _16, avx2); > init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); > + init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2); > init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); > } > > diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm > b/libavcodec/x86/vp9intrapred_16bpp.asm > index 92333bc..67b98b1 100644 > --- a/libavcodec/x86/vp9intrapred_16bpp.asm > +++ b/libavcodec/x86/vp9intrapred_16bpp.asm > @@ -1170,6 +1170,62 @@ DR_FUNCS 2 > INIT_XMM avx > DR_FUNCS 2 > > +%if HAVE_AVX2_EXTERNAL > +INIT_YMM avx2 > +cglobal vp9_ipred_dr_16x16_16, 4, 6, 7, dst, stride, l, a > + mova m0, [lq] ; klmnopqrstuvwxyz > + movu m1, [aq-2] ; *abcdefghijklmno > + mova m2, [aq] ; abcdefghijklmnop I know unaligned loads are not as slow as they used to be, but could m1 be produced by m2 and palignr? 
From the comment I assume you don't use the extra two bytes that you get from the load, as you mark them as "*" generic undefined values > + vperm2i128 m4, m2, m2, q2001 ; ijklmnop........ > + vpalignr m5, m4, m2, 2 ; bcdefghijklmnop. > + vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg > + LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO. > + vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz* > + vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a > + LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ# > + vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH > + DEFINE_ARGS dst, stride, stride3, stride5, dst3, cnt "cnt" doesn't seem to be used.
> I know unaligned loads are not as slow as they used to be, > but could m1 be produced by m2 and palignr? I am not sure; can you clarify your question? > From the comment I assume you don't use the extra two bytes > that you get from the load, as you mark them as "*" > generic undefined values No, those two extra bytes are actually used; they hold the above/left corner pixel. If you look in the vp9dsp_template.c file, there is a macro defining diag_downright_ that uses top[-1] in its body. Sorry for this ambiguous marking, but it's used in other ipred_dr functions so I decided to follow it. > "cnt" doesn't seem to be used. Yes indeed, I overlooked that, thanks. On Fri, Jun 9, 2017 at 6:03 PM, Ivan Kalvachev <ikalvachev@gmail.com> wrote: > On 6/8/17, Ilia Valiakhmetov <zakne0ne@gmail.com> wrote: > > vp9_diag_downright_16x16_12bpp_c: 149.0 > > vp9_diag_downright_16x16_12bpp_sse2: 67.8 > > vp9_diag_downright_16x16_12bpp_ssse3: 45.6 > > vp9_diag_downright_16x16_12bpp_avx: 36.6 > > vp9_diag_downright_16x16_12bpp_avx2: 25.5 > > > > ~30% faster than avx > > > > Signed-off-by: Ilia Valiakhmetov <zakne0ne@gmail.com> > > --- > > libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++ > > libavcodec/x86/vp9intrapred_16bpp.asm | 56 > > +++++++++++++++++++++++++++++++++++ > > 2 files changed, 58 insertions(+) > > > > diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c > > b/libavcodec/x86/vp9dsp_init_16bpp.c > > index d1b8fcd..8d1aa13 100644 > > --- a/libavcodec/x86/vp9dsp_init_16bpp.c > > +++ b/libavcodec/x86/vp9dsp_init_16bpp.c > > @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); > > decl_ipred_fns(dc_top, 16, mmxext, sse2); > > decl_ipred_fns(dc_left, 16, mmxext, sse2); > > decl_ipred_fn(dl, 16, 16, avx2); > > +decl_ipred_fn(dr, 16, 16, avx2); > > decl_ipred_fn(dl, 32, 16, avx2); > > > > #define decl_ipred_dir_funcs(type) \ > > @@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext > > *dsp) > > init_fpel_func(1, 1, 64, avg, _16, avx2); > > init_fpel_func(0, 1, 128, avg, _16, 
avx2); > > init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); > > + init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2); > > init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); > > } > > > > diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm > > b/libavcodec/x86/vp9intrapred_16bpp.asm > > index 92333bc..67b98b1 100644 > > --- a/libavcodec/x86/vp9intrapred_16bpp.asm > > +++ b/libavcodec/x86/vp9intrapred_16bpp.asm > > @@ -1170,6 +1170,62 @@ DR_FUNCS 2 > > INIT_XMM avx > > DR_FUNCS 2 > > > > +%if HAVE_AVX2_EXTERNAL > > +INIT_YMM avx2 > > +cglobal vp9_ipred_dr_16x16_16, 4, 6, 7, dst, stride, l, a > > + mova m0, [lq] ; > klmnopqrstuvwxyz > > + movu m1, [aq-2] ; > *abcdefghijklmno > > + mova m2, [aq] ; > abcdefghijklmnop > > From the comment I assume you don't use the extra two bytes > that you get from the load, as you mark them as "*" > generic undefined values > > > + vperm2i128 m4, m2, m2, q2001 ; > ijklmnop........ > > + vpalignr m5, m4, m2, 2 ; > bcdefghijklmnop. > > + vperm2i128 m3, m0, m1, q0201 ; > stuvwxyz*abcdefg > > + LOWPASS 1, 2, 5 ; > ABCDEFGHIJKLMNO. > > + vpalignr m4, m3, m0, 2 ; > lmnopqrstuvwxyz* > > + vpalignr m5, m3, m0, 4 ; > mnopqrstuvwxyz*a > > + LOWPASS 0, 4, 5 ; > LMNOPQRSTUVWXYZ# > > + vperm2i128 m5, m0, m1, q0201 ; > TUVWXYZ#ABCDEFGH > > + DEFINE_ARGS dst, stride, stride3, stride5, dst3, cnt > > "cnt" doesn't seem to be used. > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index d1b8fcd..8d1aa13 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); decl_ipred_fn(dl, 16, 16, avx2); +decl_ipred_fn(dr, 16, 16, avx2); decl_ipred_fn(dl, 32, 16, avx2); #define decl_ipred_dir_funcs(type) \ @@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); + init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); } diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 92333bc..67b98b1 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -1170,6 +1170,62 @@ DR_FUNCS 2 INIT_XMM avx DR_FUNCS 2 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dr_16x16_16, 4, 6, 7, dst, stride, l, a + mova m0, [lq] ; klmnopqrstuvwxyz + movu m1, [aq-2] ; *abcdefghijklmno + mova m2, [aq] ; abcdefghijklmnop + vperm2i128 m4, m2, m2, q2001 ; ijklmnop........ + vpalignr m5, m4, m2, 2 ; bcdefghijklmnop. + vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg + LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO. 
+ vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz* + vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a + LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ# + vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH + DEFINE_ARGS dst, stride, stride3, stride5, dst3, cnt + lea dst3q, [dstq+strideq*4] + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + + vpalignr m3, m5, m0, 2 + vpalignr m4, m1, m5, 2 + mova [dst3q+stride5q*2], m3 ; 14 + mova [ dstq+stride3q*2], m4 ; 6 + vpalignr m3, m5, m0, 4 + vpalignr m4, m1, m5, 4 + sub dst3q, strideq + mova [dst3q+stride5q*2], m3 ; 13 + mova [dst3q+strideq*2 ], m4 ; 5 + mova [dst3q+stride3q*4], m0 ; 15 + vpalignr m3, m5, m0, 6 + vpalignr m4, m1, m5, 6 + mova [dstq+stride3q*4], m3 ; 12 + mova [dst3q+strideq*1], m4 ; 4 + vpalignr m3, m5, m0, 8 + vpalignr m4, m1, m5, 8 + mova [dst3q+strideq*8], m3 ; 11 + mova [dst3q+strideq*0], m4 ; 3 + vpalignr m3, m5, m0, 12 + vpalignr m4, m1, m5, 12 + mova [dst3q+stride3q*2], m3 ; 9 + mova [dstq+strideq*1 ], m4 ; 1 + vpalignr m3, m5, m0, 10 + vpalignr m4, m1, m5, 10 + mova [dstq+stride5q*2], m3 ; 10 + mova [dstq+strideq*2 ], m4 ; 2 + vpalignr m3, m5, m0, 14 + vpalignr m4, m1, m5, 14 + mova [dstq+strideq*8], m3 ; 8 + mova [dstq+strideq*0], m4 ; 0 + sub dstq, strideq + mova [dst3q+strideq*4], m5 ; 7 + mova [ dstq+strideq*0], m1 ; -1 + RET +%endif + + %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a movifnidn aq, amp
vp9_diag_downright_16x16_12bpp_c: 149.0 vp9_diag_downright_16x16_12bpp_sse2: 67.8 vp9_diag_downright_16x16_12bpp_ssse3: 45.6 vp9_diag_downright_16x16_12bpp_avx: 36.6 vp9_diag_downright_16x16_12bpp_avx2: 25.5 ~30% faster than avx Signed-off-by: Ilia Valiakhmetov <zakne0ne@gmail.com> --- libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++ libavcodec/x86/vp9intrapred_16bpp.asm | 56 +++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+)