[FFmpeg-devel] libavcodec/vp9: ipred_dl_32x32_16 avx2 implementation

Submitted by Ilia on June 4, 2017, 5:52 p.m.

Details

Message ID 20170604175227.3296-1-zakne0ne@gmail.com
State New
Headers show

Commit Message

Ilia June 4, 2017, 5:52 p.m.
vp9_diag_downleft_32x32_8bpp_c: 580.2
vp9_diag_downleft_32x32_8bpp_sse2: 75.6
vp9_diag_downleft_32x32_8bpp_ssse3: 73.7
vp9_diag_downleft_32x32_8bpp_avx: 72.7
vp9_diag_downleft_32x32_10bpp_c: 1101.2
vp9_diag_downleft_32x32_10bpp_sse2: 145.4
vp9_diag_downleft_32x32_10bpp_ssse3: 137.5
vp9_diag_downleft_32x32_10bpp_avx: 134.8
vp9_diag_downleft_32x32_10bpp_avx2: 94.0
vp9_diag_downleft_32x32_12bpp_c: 1108.5
vp9_diag_downleft_32x32_12bpp_sse2: 145.5
vp9_diag_downleft_32x32_12bpp_ssse3: 137.3
vp9_diag_downleft_32x32_12bpp_avx: 135.2
vp9_diag_downleft_32x32_12bpp_avx2: 94.0

~30% faster than avx implementation

---
 libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 63 +++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

Comments

Ronald S. Bultje June 5, 2017, noon
Hi,

On Sun, Jun 4, 2017 at 1:52 PM, Ilia Valiakhmetov <zakne0ne@gmail.com>
wrote:

> vp9_diag_downleft_32x32_8bpp_c: 580.2
> vp9_diag_downleft_32x32_8bpp_sse2: 75.6
> vp9_diag_downleft_32x32_8bpp_ssse3: 73.7
> vp9_diag_downleft_32x32_8bpp_avx: 72.7
> vp9_diag_downleft_32x32_10bpp_c: 1101.2
> vp9_diag_downleft_32x32_10bpp_sse2: 145.4
> vp9_diag_downleft_32x32_10bpp_ssse3: 137.5
> vp9_diag_downleft_32x32_10bpp_avx: 134.8
> vp9_diag_downleft_32x32_10bpp_avx2: 94.0
> vp9_diag_downleft_32x32_12bpp_c: 1108.5
> vp9_diag_downleft_32x32_12bpp_sse2: 145.5
> vp9_diag_downleft_32x32_12bpp_ssse3: 137.3
> vp9_diag_downleft_32x32_12bpp_avx: 135.2
> vp9_diag_downleft_32x32_12bpp_avx2: 94.0
>
> ~30% faster than avx implementation
>
> ---
>  libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
>  libavcodec/x86/vp9intrapred_16bpp.asm | 63 +++++++++++++++++++++++++++++++++++
>  2 files changed, 65 insertions(+)


LGTM. I'll leave it open for comments for another few hours before I push.

Ronald
James Almer June 5, 2017, 5:41 p.m.
On 6/4/2017 2:52 PM, Ilia Valiakhmetov wrote:
> vp9_diag_downleft_32x32_8bpp_c: 580.2
> vp9_diag_downleft_32x32_8bpp_sse2: 75.6
> vp9_diag_downleft_32x32_8bpp_ssse3: 73.7
> vp9_diag_downleft_32x32_8bpp_avx: 72.7
> vp9_diag_downleft_32x32_10bpp_c: 1101.2
> vp9_diag_downleft_32x32_10bpp_sse2: 145.4
> vp9_diag_downleft_32x32_10bpp_ssse3: 137.5
> vp9_diag_downleft_32x32_10bpp_avx: 134.8
> vp9_diag_downleft_32x32_10bpp_avx2: 94.0
> vp9_diag_downleft_32x32_12bpp_c: 1108.5
> vp9_diag_downleft_32x32_12bpp_sse2: 145.5
> vp9_diag_downleft_32x32_12bpp_ssse3: 137.3
> vp9_diag_downleft_32x32_12bpp_avx: 135.2
> vp9_diag_downleft_32x32_12bpp_avx2: 94.0
> 
> ~30% faster than avx implementation

Nice.

> 
> ---
>  libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
>  libavcodec/x86/vp9intrapred_16bpp.asm | 63 +++++++++++++++++++++++++++++++++++
>  2 files changed, 65 insertions(+)
> 
> diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
> index 4576ff1..d1b8fcd 100644
> --- a/libavcodec/x86/vp9dsp_init_16bpp.c
> +++ b/libavcodec/x86/vp9dsp_init_16bpp.c
> @@ -52,6 +52,7 @@ decl_ipred_fns(dc,      16, mmxext, sse2);
>  decl_ipred_fns(dc_top,  16, mmxext, sse2);
>  decl_ipred_fns(dc_left, 16, mmxext, sse2);
>  decl_ipred_fn(dl,       16,     16, avx2);
> +decl_ipred_fn(dl,       32,     16, avx2);
>  
>  #define decl_ipred_dir_funcs(type) \
>  decl_ipred_fns(type, 16, sse2,  sse2); \
> @@ -135,6 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
>          init_fpel_func(1, 1,  64, avg, _16, avx2);
>          init_fpel_func(0, 1, 128, avg, _16, avx2);
>          init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
> +        init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
>      }
>  
>  #endif /* HAVE_YASM */
> diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
> index 212e413..5cd6a3e 100644
> --- a/libavcodec/x86/vp9intrapred_16bpp.asm
> +++ b/libavcodec/x86/vp9intrapred_16bpp.asm
> @@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
>      DEFINE_ARGS dst, stride, stride3, cnt
>      mov                   cntd, 2
>      lea               stride3q, [strideq*3]
> +    

Trailing whitespace.

>  .loop:
>      mova      [dstq+strideq*0], m0
>      vpalignr                m3, m2, m0, 2
> @@ -884,6 +885,68 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
>      dec                   cntd
>      jg .loop
>      RET
> +    

Same.

> +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
> +    movifnidn               aq, amp
> +    mova                    m0, [aq+mmsize*0+ 0]       ; abcdefghijklmnop
> +    mova                    m1, [aq+mmsize*1+ 0]       ; qrstuvwxyz012345
> +    vpbroadcastw           xm4, [aq+mmsize*1+30]       ; 55555555
> +    vperm2i128              m5, m0, m1, q0201          ; ijklmnopqrstuvwx
> +    vpalignr                m2, m5, m0, 2              ; bcdefghijklmnopq
> +    vpalignr                m3, m5, m0, 4              ; cdefghijklmnopqr
> +    LOWPASS                  0,  2,  3                 ; BCDEFGHIJKLMNOPQ
> +    vperm2i128              m5, m1, m4, q0201          ; yz01234555555555
> +    vpalignr                m2, m5, m1, 2              ; rstuvwxyz0123455
> +    vpalignr                m3, m5, m1, 4              ; stuvwxyz01234555
> +    LOWPASS                  1,  2,  3                 ; RSTUVWXYZ......5
> +    vperm2i128              m2, m1, m4, q0201          ; Z......555555555
> +    vperm2i128              m5, m0, m1, q0201          ; JKLMNOPQRSTUVWXY
> +    DEFINE_ARGS dst, stride, stride3, cnt
> +    lea               stride3q, [strideq*3]
> +    mov                   cntd, 4
> +    

Same.

Ronald can fix them before pushing (I think the git hooks would prevent
him from pushing this with them anyway), so no need to resend a fixed patch.
Just keep it in mind for future patchsets. Same with tabs on files other
than Makefile stuff.

> +.loop:
> +    mova   [dstq+strideq*0 + 0], m0
> +    mova   [dstq+strideq*0 +32], m1
> +    vpalignr                 m3, m5, m0, 2
> +    vpalignr                 m4, m2, m1, 2
> +    mova   [dstq+strideq*1 + 0], m3
> +    mova   [dstq+strideq*1 +32], m4
> +    vpalignr                 m3, m5, m0, 4
> +    vpalignr                 m4, m2, m1, 4
> +    mova   [dstq+strideq*2 + 0], m3
> +    mova   [dstq+strideq*2 +32], m4
> +    vpalignr                 m3, m5, m0, 6
> +    vpalignr                 m4, m2, m1, 6
> +    mova   [dstq+stride3q*1+ 0], m3
> +    mova   [dstq+stride3q*1+32], m4
> +    lea                    dstq, [dstq+strideq*4]
> +    vpalignr                 m3, m5, m0, 8
> +    vpalignr                 m4, m2, m1, 8
> +    mova   [dstq+strideq*0 + 0], m3
> +    mova   [dstq+strideq*0 +32], m4
> +    vpalignr                 m3, m5, m0, 10
> +    vpalignr                 m4, m2, m1, 10
> +    mova   [dstq+strideq*1 + 0], m3
> +    mova   [dstq+strideq*1 +32], m4
> +    vpalignr                 m3, m5, m0, 12
> +    vpalignr                 m4, m2, m1, 12
> +    mova   [dstq+strideq*2+ 0], m3
> +    mova   [dstq+strideq*2+32], m4
> +    vpalignr                 m3, m5, m0, 14
> +    vpalignr                 m4, m2, m1, 14
> +    mova   [dstq+stride3q+  0], m3
> +    mova   [dstq+stride3q+ 32], m4
> +    vpalignr                 m3, m5, m0, 16
> +    vpalignr                 m4, m2, m1, 16
> +    vperm2i128               m5, m3, m4, q0201
> +    vperm2i128               m2, m4, m4, q0101
> +    mova                     m0, m3
> +    mova                     m1, m4
> +    lea                    dstq, [dstq+strideq*4]
> +    dec                    cntd
> +    jg .loop
> +    RET
>  %endif
>  
>  %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
>
Ronald S. Bultje June 6, 2017, 12:13 p.m.
Hi,

On Mon, Jun 5, 2017 at 1:41 PM, James Almer <jamrial@gmail.com> wrote:

> On 6/4/2017 2:52 PM, Ilia Valiakhmetov wrote:
> > vp9_diag_downleft_32x32_8bpp_c: 580.2
> > vp9_diag_downleft_32x32_8bpp_sse2: 75.6
> > vp9_diag_downleft_32x32_8bpp_ssse3: 73.7
> > vp9_diag_downleft_32x32_8bpp_avx: 72.7
> > vp9_diag_downleft_32x32_10bpp_c: 1101.2
> > vp9_diag_downleft_32x32_10bpp_sse2: 145.4
> > vp9_diag_downleft_32x32_10bpp_ssse3: 137.5
> > vp9_diag_downleft_32x32_10bpp_avx: 134.8
> > vp9_diag_downleft_32x32_10bpp_avx2: 94.0
> > vp9_diag_downleft_32x32_12bpp_c: 1108.5
> > vp9_diag_downleft_32x32_12bpp_sse2: 145.5
> > vp9_diag_downleft_32x32_12bpp_ssse3: 137.3
> > vp9_diag_downleft_32x32_12bpp_avx: 135.2
> > vp9_diag_downleft_32x32_12bpp_avx2: 94.0
> >
> > ~30% faster than avx implementation
>
> Nice.
>
> >
> > ---
> >  libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
> >  libavcodec/x86/vp9intrapred_16bpp.asm | 63
> +++++++++++++++++++++++++++++++++++
> >  2 files changed, 65 insertions(+)
> >
> > diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c
> b/libavcodec/x86/vp9dsp_init_16bpp.c
> > index 4576ff1..d1b8fcd 100644
> > --- a/libavcodec/x86/vp9dsp_init_16bpp.c
> > +++ b/libavcodec/x86/vp9dsp_init_16bpp.c
> > @@ -52,6 +52,7 @@ decl_ipred_fns(dc,      16, mmxext, sse2);
> >  decl_ipred_fns(dc_top,  16, mmxext, sse2);
> >  decl_ipred_fns(dc_left, 16, mmxext, sse2);
> >  decl_ipred_fn(dl,       16,     16, avx2);
> > +decl_ipred_fn(dl,       32,     16, avx2);
> >
> >  #define decl_ipred_dir_funcs(type) \
> >  decl_ipred_fns(type, 16, sse2,  sse2); \
> > @@ -135,6 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext
> *dsp)
> >          init_fpel_func(1, 1,  64, avg, _16, avx2);
> >          init_fpel_func(0, 1, 128, avg, _16, avx2);
> >          init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
> > +        init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
> >      }
> >
> >  #endif /* HAVE_YASM */
> > diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm
> b/libavcodec/x86/vp9intrapred_16bpp.asm
> > index 212e413..5cd6a3e 100644
> > --- a/libavcodec/x86/vp9intrapred_16bpp.asm
> > +++ b/libavcodec/x86/vp9intrapred_16bpp.asm
> > @@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride,
> l, a
> >      DEFINE_ARGS dst, stride, stride3, cnt
> >      mov                   cntd, 2
> >      lea               stride3q, [strideq*3]
> > +
>
> Trailing whitespace.
>
> >  .loop:
> >      mova      [dstq+strideq*0], m0
> >      vpalignr                m3, m2, m0, 2
> > @@ -884,6 +885,68 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst,
> stride, l, a
> >      dec                   cntd
> >      jg .loop
> >      RET
> > +
>
> Same.
>
> > +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
> > +    movifnidn               aq, amp
> > +    mova                    m0, [aq+mmsize*0+ 0]       ; abcdefghijklmnop
> > +    mova                    m1, [aq+mmsize*1+ 0]       ; qrstuvwxyz012345
> > +    vpbroadcastw           xm4, [aq+mmsize*1+30]       ; 55555555
> > +    vperm2i128              m5, m0, m1, q0201          ; ijklmnopqrstuvwx
> > +    vpalignr                m2, m5, m0, 2              ; bcdefghijklmnopq
> > +    vpalignr                m3, m5, m0, 4              ; cdefghijklmnopqr
> > +    LOWPASS                  0,  2,  3                 ; BCDEFGHIJKLMNOPQ
> > +    vperm2i128              m5, m1, m4, q0201          ; yz01234555555555
> > +    vpalignr                m2, m5, m1, 2              ; rstuvwxyz0123455
> > +    vpalignr                m3, m5, m1, 4              ; stuvwxyz01234555
> > +    LOWPASS                  1,  2,  3                 ; RSTUVWXYZ......5
> > +    vperm2i128              m2, m1, m4, q0201          ; Z......555555555
> > +    vperm2i128              m5, m0, m1, q0201          ; JKLMNOPQRSTUVWXY
> > +    DEFINE_ARGS dst, stride, stride3, cnt
> > +    lea               stride3q, [strideq*3]
> > +    mov                   cntd, 4
> > +
>
> Same.
>
> Ronald can fix them before pushing (I think the git hooks would prevent
> him from pushing this with them anyway), so no need to resend a fixed patch.
> Just keep it in mind for future patchsets. Same with tabs on files other
> than Makefile stuff.


Pushed with that fixed.

Ronald

Patch hide | download patch | download mbox

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index 4576ff1..d1b8fcd 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -52,6 +52,7 @@  decl_ipred_fns(dc,      16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
 decl_ipred_fn(dl,       16,     16, avx2);
+decl_ipred_fn(dl,       32,     16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
 decl_ipred_fns(type, 16, sse2,  sse2); \
@@ -135,6 +136,7 @@  av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
         init_fpel_func(1, 1,  64, avg, _16, avx2);
         init_fpel_func(0, 1, 128, avg, _16, avx2);
         init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+        init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
     }
 
 #endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
index 212e413..5cd6a3e 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -861,6 +861,7 @@  cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
     DEFINE_ARGS dst, stride, stride3, cnt
     mov                   cntd, 2
     lea               stride3q, [strideq*3]
+    
 .loop:
     mova      [dstq+strideq*0], m0
     vpalignr                m3, m2, m0, 2
@@ -884,6 +885,68 @@  cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
     dec                   cntd
     jg .loop
     RET
+    
+cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a ; 32x32 diag down-left pred, 16-bit samples
+    movifnidn               aq, amp                    ; make sure a is in a register
+    mova                    m0, [aq+mmsize*0+ 0]       ; abcdefghijklmnop
+    mova                    m1, [aq+mmsize*1+ 0]       ; qrstuvwxyz012345
+    vpbroadcastw           xm4, [aq+mmsize*1+30]       ; 55555555
+    vperm2i128              m5, m0, m1, q0201          ; ijklmnopqrstuvwx
+    vpalignr                m2, m5, m0, 2              ; bcdefghijklmnopq
+    vpalignr                m3, m5, m0, 4              ; cdefghijklmnopqr
+    LOWPASS                  0,  2,  3                 ; BCDEFGHIJKLMNOPQ
+    vperm2i128              m5, m1, m4, q0201          ; yz01234555555555
+    vpalignr                m2, m5, m1, 2              ; rstuvwxyz0123455
+    vpalignr                m3, m5, m1, 4              ; stuvwxyz01234555
+    LOWPASS                  1,  2,  3                 ; RSTUVWXYZ......5
+    vperm2i128              m2, m1, m4, q0201          ; Z......555555555
+    vperm2i128              m5, m0, m1, q0201          ; JKLMNOPQRSTUVWXY
+    DEFINE_ARGS dst, stride, stride3, cnt              ; l/a are dead: rename them stride3/cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4                      ; 4 iterations x 8 rows = 32 rows
+
+.loop:                                                 ; m0:m1 = current row, m5:m2 = samples shifted in
+    mova   [dstq+strideq*0 + 0], m0                    ; row 0, first 32 bytes
+    mova   [dstq+strideq*0 +32], m1                    ; row 0, second 32 bytes
+    vpalignr                 m3, m5, m0, 2             ; advance 1 sample (2 bytes)
+    vpalignr                 m4, m2, m1, 2
+    mova   [dstq+strideq*1 + 0], m3                    ; row 1
+    mova   [dstq+strideq*1 +32], m4
+    vpalignr                 m3, m5, m0, 4             ; advance 2 samples
+    vpalignr                 m4, m2, m1, 4
+    mova   [dstq+strideq*2 + 0], m3                    ; row 2
+    mova   [dstq+strideq*2 +32], m4
+    vpalignr                 m3, m5, m0, 6             ; advance 3 samples
+    vpalignr                 m4, m2, m1, 6
+    mova   [dstq+stride3q*1+ 0], m3                    ; row 3
+    mova   [dstq+stride3q*1+32], m4
+    lea                    dstq, [dstq+strideq*4]      ; dst += 4 rows
+    vpalignr                 m3, m5, m0, 8             ; advance 4 samples
+    vpalignr                 m4, m2, m1, 8
+    mova   [dstq+strideq*0 + 0], m3                    ; row 4
+    mova   [dstq+strideq*0 +32], m4
+    vpalignr                 m3, m5, m0, 10            ; advance 5 samples
+    vpalignr                 m4, m2, m1, 10
+    mova   [dstq+strideq*1 + 0], m3                    ; row 5
+    mova   [dstq+strideq*1 +32], m4
+    vpalignr                 m3, m5, m0, 12            ; advance 6 samples
+    vpalignr                 m4, m2, m1, 12
+    mova   [dstq+strideq*2+ 0], m3                     ; row 6
+    mova   [dstq+strideq*2+32], m4
+    vpalignr                 m3, m5, m0, 14            ; advance 7 samples
+    vpalignr                 m4, m2, m1, 14
+    mova   [dstq+stride3q+  0], m3                     ; row 7
+    mova   [dstq+stride3q+ 32], m4
+    vpalignr                 m3, m5, m0, 16            ; advance 8 samples: next iteration's row 0
+    vpalignr                 m4, m2, m1, 16
+    vperm2i128               m5, m3, m4, q0201         ; refill: high lane of m3, low lane of m4
+    vperm2i128               m2, m4, m4, q0101         ; presumably high lane of m4 in both lanes
+    mova                     m0, m3                    ; new current row, first half
+    mova                     m1, m4                    ; new current row, second half
+    lea                    dstq, [dstq+strideq*4]      ; dst += 4 rows (8 per iteration in total)
+    dec                    cntd
+    jg .loop
+    RET
 %endif
 
 %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function