diff mbox

[FFmpeg-devel] avcodec/vp9: ipred_dr_16x16_16 avx2 implementation

Message ID 20170608150824.3092-1-zakne0ne@gmail.com
State Superseded
Headers show

Commit Message

Ilia June 8, 2017, 3:08 p.m. UTC
vp9_diag_downright_16x16_12bpp_c: 149.0
vp9_diag_downright_16x16_12bpp_sse2: 67.8
vp9_diag_downright_16x16_12bpp_ssse3: 45.6
vp9_diag_downright_16x16_12bpp_avx: 36.6
vp9_diag_downright_16x16_12bpp_avx2: 25.5

~30% faster than avx

Signed-off-by: Ilia Valiakhmetov <zakne0ne@gmail.com>
---
 libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 56 +++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

Comments

Ivan Kalvachev June 9, 2017, 11:03 a.m. UTC | #1
On 6/8/17, Ilia Valiakhmetov <zakne0ne@gmail.com> wrote:
> vp9_diag_downright_16x16_12bpp_c: 149.0
> vp9_diag_downright_16x16_12bpp_sse2: 67.8
> vp9_diag_downright_16x16_12bpp_ssse3: 45.6
> vp9_diag_downright_16x16_12bpp_avx: 36.6
> vp9_diag_downright_16x16_12bpp_avx2: 25.5
>
> ~30% faster than avx
>
> Signed-off-by: Ilia Valiakhmetov <zakne0ne@gmail.com>
> ---
>  libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
>  libavcodec/x86/vp9intrapred_16bpp.asm | 56
> +++++++++++++++++++++++++++++++++++
>  2 files changed, 58 insertions(+)
>
> diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c
> b/libavcodec/x86/vp9dsp_init_16bpp.c
> index d1b8fcd..8d1aa13 100644
> --- a/libavcodec/x86/vp9dsp_init_16bpp.c
> +++ b/libavcodec/x86/vp9dsp_init_16bpp.c
> @@ -52,6 +52,7 @@ decl_ipred_fns(dc,      16, mmxext, sse2);
>  decl_ipred_fns(dc_top,  16, mmxext, sse2);
>  decl_ipred_fns(dc_left, 16, mmxext, sse2);
>  decl_ipred_fn(dl,       16,     16, avx2);
> +decl_ipred_fn(dr,       16,     16, avx2);
>  decl_ipred_fn(dl,       32,     16, avx2);
>
>  #define decl_ipred_dir_funcs(type) \
> @@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext
> *dsp)
>          init_fpel_func(1, 1,  64, avg, _16, avx2);
>          init_fpel_func(0, 1, 128, avg, _16, avx2);
>          init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
> +        init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
>          init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
>      }
>
> diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm
> b/libavcodec/x86/vp9intrapred_16bpp.asm
> index 92333bc..67b98b1 100644
> --- a/libavcodec/x86/vp9intrapred_16bpp.asm
> +++ b/libavcodec/x86/vp9intrapred_16bpp.asm
> @@ -1170,6 +1170,62 @@ DR_FUNCS 2
>  INIT_XMM avx
>  DR_FUNCS 2
>
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +cglobal vp9_ipred_dr_16x16_16, 4, 6, 7, dst, stride, l, a
> +    mova                    m0, [lq]                   ; klmnopqrstuvwxyz
> +    movu                    m1, [aq-2]                 ; *abcdefghijklmno
> +    mova                    m2, [aq]                   ; abcdefghijklmnop

I know unaligned loads are not as slow as they used to be,
but could m1 be produced by m2 and palignr?

From the comment I assume you don't use the extra two bytes
that you get from the load, as you mark them as "*"
generic undefined values

> +    vperm2i128              m4, m2, m2, q2001          ; ijklmnop........
> +    vpalignr                m5, m4, m2, 2              ; bcdefghijklmnop.
> +    vperm2i128              m3, m0, m1, q0201          ; stuvwxyz*abcdefg
> +    LOWPASS                  1,  2,  5                 ; ABCDEFGHIJKLMNO.
> +    vpalignr                m4, m3, m0, 2              ; lmnopqrstuvwxyz*
> +    vpalignr                m5, m3, m0, 4              ; mnopqrstuvwxyz*a
> +    LOWPASS                  0,  4,  5                 ; LMNOPQRSTUVWXYZ#
> +    vperm2i128              m5, m0, m1, q0201          ; TUVWXYZ#ABCDEFGH
> +    DEFINE_ARGS dst, stride, stride3, stride5, dst3, cnt

"cnt" doesn't seem to be used.
Ilia June 9, 2017, 11:48 a.m. UTC | #2
>I know unaligned loads are not as slow as they used to be,
>but could m1 be produced by m2 and palignr?

I am not sure, can you clarify your question?

>From the comment I assume you don't use the extra two bytes
>that you get from the load, as you mark them as "*"
>generic undefined values

No, those two extra bytes are actually used: they hold the above/left corner
pixel.
If you look in the vp9dsp_template.c file, there is a macro that defines
diag_downright_, and that pixel is top[-1] in its body. Sorry for the
ambiguous marking, but it is used in the other ipred_dr functions, so I
decided to follow it.

>"cnt" doesn't seem to be used.

Yes indeed, I overlooked that, thanks.

On Fri, Jun 9, 2017 at 6:03 PM, Ivan Kalvachev <ikalvachev@gmail.com> wrote:

> On 6/8/17, Ilia Valiakhmetov <zakne0ne@gmail.com> wrote:
> > vp9_diag_downright_16x16_12bpp_c: 149.0
> > vp9_diag_downright_16x16_12bpp_sse2: 67.8
> > vp9_diag_downright_16x16_12bpp_ssse3: 45.6
> > vp9_diag_downright_16x16_12bpp_avx: 36.6
> > vp9_diag_downright_16x16_12bpp_avx2: 25.5
> >
> > ~30% faster than avx
> >
> > Signed-off-by: Ilia Valiakhmetov <zakne0ne@gmail.com>
> > ---
> >  libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
> >  libavcodec/x86/vp9intrapred_16bpp.asm | 56
> > +++++++++++++++++++++++++++++++++++
> >  2 files changed, 58 insertions(+)
> >
> > diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c
> > b/libavcodec/x86/vp9dsp_init_16bpp.c
> > index d1b8fcd..8d1aa13 100644
> > --- a/libavcodec/x86/vp9dsp_init_16bpp.c
> > +++ b/libavcodec/x86/vp9dsp_init_16bpp.c
> > @@ -52,6 +52,7 @@ decl_ipred_fns(dc,      16, mmxext, sse2);
> >  decl_ipred_fns(dc_top,  16, mmxext, sse2);
> >  decl_ipred_fns(dc_left, 16, mmxext, sse2);
> >  decl_ipred_fn(dl,       16,     16, avx2);
> > +decl_ipred_fn(dr,       16,     16, avx2);
> >  decl_ipred_fn(dl,       32,     16, avx2);
> >
> >  #define decl_ipred_dir_funcs(type) \
> > @@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext
> > *dsp)
> >          init_fpel_func(1, 1,  64, avg, _16, avx2);
> >          init_fpel_func(0, 1, 128, avg, _16, avx2);
> >          init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
> > +        init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
> >          init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
> >      }
> >
> > diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm
> > b/libavcodec/x86/vp9intrapred_16bpp.asm
> > index 92333bc..67b98b1 100644
> > --- a/libavcodec/x86/vp9intrapred_16bpp.asm
> > +++ b/libavcodec/x86/vp9intrapred_16bpp.asm
> > @@ -1170,6 +1170,62 @@ DR_FUNCS 2
> >  INIT_XMM avx
> >  DR_FUNCS 2
> >
> > +%if HAVE_AVX2_EXTERNAL
> > +INIT_YMM avx2
> > +cglobal vp9_ipred_dr_16x16_16, 4, 6, 7, dst, stride, l, a
> > +    mova                    m0, [lq]                   ;
> klmnopqrstuvwxyz
> > +    movu                    m1, [aq-2]                 ;
> *abcdefghijklmno
> > +    mova                    m2, [aq]                   ;
> abcdefghijklmnop
>


> From the comment I assume you don't use the extra two bytes
> that you get from the load, as you mark them as "*"
> generic undefined values
>
> > +    vperm2i128              m4, m2, m2, q2001          ;
> ijklmnop........
> > +    vpalignr                m5, m4, m2, 2              ;
> bcdefghijklmnop.
> > +    vperm2i128              m3, m0, m1, q0201          ;
> stuvwxyz*abcdefg
> > +    LOWPASS                  1,  2,  5                 ;
> ABCDEFGHIJKLMNO.
> > +    vpalignr                m4, m3, m0, 2              ;
> lmnopqrstuvwxyz*
> > +    vpalignr                m5, m3, m0, 4              ;
> mnopqrstuvwxyz*a
> > +    LOWPASS                  0,  4,  5                 ;
> LMNOPQRSTUVWXYZ#
> > +    vperm2i128              m5, m0, m1, q0201          ;
> TUVWXYZ#ABCDEFGH
> > +    DEFINE_ARGS dst, stride, stride3, stride5, dst3, cnt
>
> "cnt" doesn't seem to be used.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
diff mbox

Patch

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index d1b8fcd..8d1aa13 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -52,6 +52,7 @@  decl_ipred_fns(dc,      16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
 decl_ipred_fn(dl,       16,     16, avx2);
+decl_ipred_fn(dr,       16,     16, avx2);
 decl_ipred_fn(dl,       32,     16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
@@ -136,6 +137,7 @@  av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
         init_fpel_func(1, 1,  64, avg, _16, avx2);
         init_fpel_func(0, 1, 128, avg, _16, avx2);
         init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+        init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
         init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
     }
 
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
index 92333bc..67b98b1 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -1170,6 +1170,62 @@  DR_FUNCS 2
 INIT_XMM avx
 DR_FUNCS 2
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; 16x16 diagonal down-right intra prediction, 16-bit pixels; '*' below is the top-left pixel a[-1]
+cglobal vp9_ipred_dr_16x16_16, 4, 5, 7, dst, stride, l, a
+    mova                    m0, [lq]                   ; klmnopqrstuvwxyz
+    movu                    m1, [aq-2]                 ; *abcdefghijklmno
+    mova                    m2, [aq]                   ; abcdefghijklmnop
+    vperm2i128              m4, m2, m2, q2001          ; ijklmnop........
+    vpalignr                m5, m4, m2, 2              ; bcdefghijklmnop.
+    vperm2i128              m3, m0, m1, q0201          ; stuvwxyz*abcdefg
+    LOWPASS                  1,  2,  5                 ; ABCDEFGHIJKLMNO.
+    vpalignr                m4, m3, m0, 2              ; lmnopqrstuvwxyz*
+    vpalignr                m5, m3, m0, 4              ; mnopqrstuvwxyz*a
+    LOWPASS                  0,  4,  5                 ; LMNOPQRSTUVWXYZ#
+    vperm2i128              m5, m0, m1, q0201          ; TUVWXYZ#ABCDEFGH
+    DEFINE_ARGS dst, stride, stride3, stride5, dst3
+    lea                  dst3q, [dstq+strideq*4]       ; dst3 = dst + 4*stride
+    lea               stride3q, [strideq*3]            ; stride3 = 3*stride
+    lea               stride5q, [stride3q+strideq*2]   ; stride5 = 5*stride
+
+    ; the number after each store is the destination row; exactly rows 0..15 are written
+    vpalignr                m3, m5, m0, 2
+    vpalignr                m4, m1, m5, 2
+    mova    [dst3q+stride5q*2], m3                     ; 14
+    mova    [ dstq+stride3q*2], m4                     ; 6
+    vpalignr                m3, m5, m0, 4
+    vpalignr                m4, m1, m5, 4
+    sub                  dst3q, strideq                ; dst3 = dst + 3*stride from here on
+    mova    [dst3q+stride5q*2], m3                     ; 13
+    mova    [dst3q+strideq*2 ], m4                     ; 5
+    mova    [dst3q+stride3q*4], m0                     ; 15
+    vpalignr                m3, m5, m0, 6
+    vpalignr                m4, m1, m5, 6
+    mova     [dstq+stride3q*4], m3                     ; 12
+    mova     [dst3q+strideq*1], m4                     ; 4
+    vpalignr                m3, m5, m0, 8
+    vpalignr                m4, m1, m5, 8
+    mova     [dst3q+strideq*8], m3                     ; 11
+    mova     [dst3q+strideq*0], m4                     ; 3
+    vpalignr                m3, m5, m0, 12
+    vpalignr                m4, m1, m5, 12
+    mova    [dst3q+stride3q*2], m3                     ; 9
+    mova     [dstq+strideq*1 ], m4                     ; 1
+    vpalignr                m3, m5, m0, 10
+    vpalignr                m4, m1, m5, 10
+    mova     [dstq+stride5q*2], m3                     ; 10
+    mova     [dstq+strideq*2 ], m4                     ; 2
+    vpalignr                m3, m5, m0, 14
+    vpalignr                m4, m1, m5, 14
+    mova      [dstq+strideq*8], m3                     ; 8
+    mova      [dstq+strideq*0], m4                     ; 0
+    mova     [dst3q+strideq*4], m5                     ; 7
+    RET
+%endif
+
+
 %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
 cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
     movifnidn               aq, amp