diff mbox

[FFmpeg-devel] avcodec/vp9: AVX2 ipred_vl_16x16

Message ID 20170703110907.5980-1-zakne0ne@gmail.com
State New
Headers show

Commit Message

Ilia July 3, 2017, 11:09 a.m. UTC
vp9_vert_left_16x16_12bpp_c: 273.8
vp9_vert_left_16x16_12bpp_sse2: 69.4
vp9_vert_left_16x16_12bpp_ssse3: 35.3
vp9_vert_left_16x16_12bpp_avx: 34.6
vp9_vert_left_16x16_12bpp_avx2: 22.4

~35% less time than avx (34.6 -> 22.4), i.e. roughly 1.5x as fast

Signed-off-by: Ilia Valiakhmetov <zakne0ne@gmail.com>
---
 libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 53 +++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)
diff mbox

Patch

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index 60d10a1..da8b74c 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -55,6 +55,7 @@  decl_ipred_fn(dl,       16,     16, avx2);
 decl_ipred_fn(dl,       32,     16, avx2);
 decl_ipred_fn(dr,       16,     16, avx2);
 decl_ipred_fn(dr,       32,     16, avx2);
+decl_ipred_fn(vl,       16,     16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
 decl_ipred_fns(type, 16, sse2,  sse2); \
@@ -143,6 +144,7 @@  av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 #if ARCH_X86_64
         init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2);
 #endif
+        init_ipred_func(vl, VERT_LEFT, 16, 16, avx2);
     }
 
 #endif /* HAVE_X86ASM */
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
index 32b6982..8d8d65e 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -1538,6 +1538,59 @@  VL_FUNCS 1
 INIT_XMM avx
 VL_FUNCS 1
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a ; vert-left intra prediction: 16 rows of 16 words, built from the 16 words at [a]
+    movifnidn               aq, amp                 ; get a into a register (x86inc helper; presumably a no-op when it already is one)
+    mova                    m0, [aq]                ; abcdefghijklmnop
+    vpbroadcastw           xm5, [aq+30]             ; pppppppp
+    vperm2i128              m1, m0, m5, q0201       ; ijklmnoppppppppp
+    vpalignr                m2, m1, m0, 2           ; bcdefghijklmnopp
+    vpalignr                m3, m1, m0, 4           ; cdefghijklmnoppp
+    mova                    m4, m2                  ; m4 will hold the 2-tap row (even output rows)
+    pavgw                   m4, m0                  ; m4 = rounded average of each word with its right neighbour
+    LOWPASS                  0,  2,  3              ; BCDEFGHIJKLMNOPp
+    vperm2i128              m2, m0, m5, q0201       ; IJKLMNOPpppppppp: upper half of m0, then p fill
+    vperm2i128              m3, m4, m5, q0201       ; upper half of m4 (average row), then p fill
+    DEFINE_ARGS dst, stride, stride3                ; re-declare register names: stride3 takes the place of l/a
+    lea               stride3q, [strideq*3]         ; stride3 = 3 * stride, used for every fourth row

+    mova      [dstq+strideq*0], m4                  ; row 0: average row, unshifted
+    mova      [dstq+strideq*1], m0                  ; row 1: low-pass row, unshifted
+    vpalignr                m1, m2, m0, 2           ; low-pass row advanced by 1 word, p shifted in at the end
+    vpalignr                m5, m3, m4, 2           ; average row advanced by 1 word (m5 is free: m2/m3 carry the p fill)
+    mova      [dstq+strideq*2], m5                  ; row 2
+    mova      [dstq+stride3q ], m1                  ; row 3
+    vpalignr                m1, m2, m0, 4           ; low-pass row advanced by 2 words
+    vpalignr                m5, m3, m4, 4           ; average row advanced by 2 words
+    lea                   dstq, [dstq+strideq*4]    ; dst now points at row 4
+    mova      [dstq+strideq*0], m5                  ; row 4
+    mova      [dstq+strideq*1], m1                  ; row 5
+    vpalignr                m1, m2, m0, 6           ; low-pass row advanced by 3 words
+    vpalignr                m5, m3, m4, 6           ; average row advanced by 3 words
+    mova      [dstq+strideq*2], m5                  ; row 6
+    mova      [dstq+stride3q ], m1                  ; row 7
+    vpalignr                m1, m2, m0, 8           ; low-pass row advanced by 4 words
+    vpalignr                m5, m3, m4, 8           ; average row advanced by 4 words
+    lea                   dstq, [dstq+strideq*4]    ; dst now points at row 8
+    mova      [dstq+strideq*0], m5                  ; row 8
+    mova      [dstq+strideq*1], m1                  ; row 9
+    vpalignr                m1, m2, m0, 10          ; low-pass row advanced by 5 words
+    vpalignr                m5, m3, m4, 10          ; average row advanced by 5 words
+    mova      [dstq+strideq*2], m5                  ; row 10
+    mova      [dstq+stride3q ], m1                  ; row 11
+    vpalignr                m1, m2, m0, 12          ; low-pass row advanced by 6 words
+    vpalignr                m5, m3, m4, 12          ; average row advanced by 6 words
+    lea                   dstq, [dstq+strideq*4]    ; dst now points at row 12
+    mova      [dstq+strideq*0], m5                  ; row 12
+    mova      [dstq+strideq*1], m1                  ; row 13
+    vpalignr                m1, m2, m0, 14          ; low-pass row advanced by 7 words
+    vpalignr                m5, m3, m4, 14          ; average row advanced by 7 words
+    mova      [dstq+strideq*2], m5                  ; row 14
+    mova      [dstq+stride3q ], m1                  ; row 15
+    RET
+%endif
+
 %macro VR_FUNCS 0
 cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
     movu                    m0, [aq-2]