diff mbox

[FFmpeg-devel] libavcodec/vp9 ipred_dl_32x32_16 avx2 version

Message ID 20170604140907.4692-1-zakne0ne@gmail.com
State New
Headers show

Commit Message

Ilia June 4, 2017, 2:09 p.m. UTC
vp9_diag_downleft_32x32_8bpp_c: 580.2
vp9_diag_downleft_32x32_8bpp_sse2: 75.6
vp9_diag_downleft_32x32_8bpp_ssse3: 73.7
vp9_diag_downleft_32x32_8bpp_avx: 72.7
vp9_diag_downleft_32x32_10bpp_c: 1101.2
vp9_diag_downleft_32x32_10bpp_sse2: 145.4
vp9_diag_downleft_32x32_10bpp_ssse3: 137.5
vp9_diag_downleft_32x32_10bpp_avx: 134.8
vp9_diag_downleft_32x32_10bpp_avx2: 94.0
vp9_diag_downleft_32x32_12bpp_c: 1108.5
vp9_diag_downleft_32x32_12bpp_sse2: 145.5
vp9_diag_downleft_32x32_12bpp_ssse3: 137.3
vp9_diag_downleft_32x32_12bpp_avx: 135.2
vp9_diag_downleft_32x32_12bpp_avx2: 94.0

The new AVX2 version is ~30% faster than the AVX version (94.0 vs. 134.8 for 10bpp, 94.0 vs. 135.2 for 12bpp).

---
 libavcodec/x86/vp9dsp_init_16bpp.c  | 4 +-
 libavcodec/x86/vp9intrapred_16bpp.asm | 75 +++++++++++++++++++++++++++--------
 2 files changed, 60 insertions(+), 19 deletions(-)
diff mbox

Patch

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index 4e1f24f..d1b8fcd 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -52,7 +52,7 @@ decl_ipred_fns(dc,   16, mmxext, sse2);
 decl_ipred_fns(dc_top, 16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
 decl_ipred_fn(dl,    16,   16, avx2);
-decl_ipred_fn(dl,    32,   32, avx2);
+decl_ipred_fn(dl,    32,   16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
 decl_ipred_fns(type, 16, sse2, sse2); \
@@ -136,7 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
     init_fpel_func(1, 1, 64, avg, _16, avx2);
     init_fpel_func(0, 1, 128, avg, _16, avx2);
     init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
-    init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 32, avx2);
+    init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
   }
 
 #endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
index 2ec5381..10a0994 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
   DEFINE_ARGS dst, stride, stride3, cnt
   mov          cntd, 2
   lea        stride3q, [strideq*3]
+  
 .loop:
   mova   [dstq+strideq*0], m0
   vpalignr        m3, m2, m0, 2
@@ -887,24 +888,64 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
   
 cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
  movifnidn        aq, amp
-  mova          m0, [aq+mmsize*0]    ; abcdefghijklmnop
-  mova          m1, [aq+mmsize*1]    ; qrstuvwxyz012345
-  vpbroadcastw      xm4, [aq+mmsize*1+30]  ; 55555555
-  vpalignr        m2, m1, m0, 2      ; bcdefghijklmnopq
-  vpalignr        m3, m1, m0, 4      ; cdefghijklmnopqr
-  vperm2i128       m5, m1, m4, q0201    ; yz01234555555555
-  LOWPASS         0, 2, 3       ; BCDEFGHIJKLMNOPQ
-  vpalignr        m2, m5, m1, 2      ; rstuvwxyz0123455
-  vpalignr        m3, m5, m1, 4      ; stuvwxyz01234555
-  LOWPASS         1, 2, 3       ; RSTUVWXYZ......5
-  vperm2i128       m2, m1, m4, q0201    ; Z......555555555
+  mova          m0, [aq+mmsize*0+ 0]    ; abcdefghijklmnop
+  mova          m1, [aq+mmsize*1+ 0]    ; qrstuvwxyz012345
+  vpbroadcastw      xm4, [aq+mmsize*1+30]    ; 55555555
+  vperm2i128       m5, m0, m1, q0201     ; ijklmnopqrstuvwx (m0 high lane : m1 low lane)
+  vpalignr        m2, m5, m0, 2       ; bcdefghijklmnopq
+  vpalignr        m3, m5, m0, 4       ; cdefghijklmnopqr
+  LOWPASS         0, 2, 3         ; BCDEFGHIJKLMNOPQ
+  vperm2i128       m5, m1, m4, q0201     ; yz01234555555555
+  vpalignr        m2, m5, m1, 2       ; rstuvwxyz0123455
+  vpalignr        m3, m5, m1, 4       ; stuvwxyz01234555
+  LOWPASS         1, 2, 3         ; RSTUVWXYZ......5
+  vperm2i128       m2, m1, m4, q0201     ; Z......555555555
+  vperm2i128       m5, m0, m1, q0201     ; JKLMNOPQRSTUVWXY
+  DEFINE_ARGS dst, stride, stride3, cnt
+  lea        stride3q, [strideq*3]
+  mov          cntd, 4            ; 4 passes of 8 rows each = 32 rows
  
+.loop:
+  mova  [dstq+strideq*0 + 0], m0      ; row 0 of this 8-row group
+  mova  [dstq+strideq*0 +32], m1
+  vpalignr         m3, m5, m0, 2
+  vpalignr         m4, m2, m1, 2
+  mova  [dstq+strideq*1 + 0], m3      ; row 1
+  mova  [dstq+strideq*1 +32], m4
+  vpalignr         m3, m5, m0, 4
+  vpalignr         m4, m2, m1, 4
+  mova  [dstq+strideq*2 + 0], m3      ; row 2
+  mova  [dstq+strideq*2 +32], m4
+  vpalignr         m3, m5, m0, 6
+  vpalignr         m4, m2, m1, 6
+  mova  [dstq+stride3q*1+ 0], m3      ; row 3
+  mova  [dstq+stride3q*1+32], m4
+  lea          dstq, [dstq+strideq*4] ; step to row 4 so rows 4-7 reuse the 0/1/2/3 stride forms
+  vpalignr         m3, m5, m0, 8
+  vpalignr         m4, m2, m1, 8
+  mova  [dstq+strideq*0 + 0], m3      ; row 4
+  mova  [dstq+strideq*0 +32], m4
+  vpalignr         m3, m5, m0, 10
+  vpalignr         m4, m2, m1, 10
+  mova  [dstq+strideq*1 + 0], m3      ; row 5
+  mova  [dstq+strideq*1 +32], m4
+  vpalignr         m3, m5, m0, 12
+  vpalignr         m4, m2, m1, 12
+  mova  [dstq+strideq*2 + 0], m3      ; row 6
+  mova  [dstq+strideq*2 +32], m4
+  vpalignr         m3, m5, m0, 14
+  vpalignr         m4, m2, m1, 14
+  mova  [dstq+stride3q*1+ 0], m3      ; row 7 (must not land at row 6 + 64 bytes)
+  mova  [dstq+stride3q*1+32], m4
+  vpalignr         m3, m5, m0, 16     ; shift by a full lane: m3 = m5
+  vpalignr         m4, m2, m1, 16     ; likewise m4 = m2
+  vperm2i128        m5, m3, m4, q0201 ; next lane-crossing helper: m3 high lane : m4 low lane
+  vperm2i128        m2, m4, m4, q0101 ; m4 high lane duplicated into both lanes
+  mova           m0, m3
+  mova           m1, m4
+  lea          dstq, [dstq+strideq*4] ; together with the mid-loop lea: 8 rows per pass
+  dec          cntd
+  jg .loop
   RET
 %endif