@@ -55,6 +55,7 @@ decl_ipred_fn(dl, 16, 16, avx2);
decl_ipred_fn(dl, 32, 16, avx2);
decl_ipred_fn(dr, 16, 16, avx2);
decl_ipred_fn(dr, 32, 16, avx2);
+decl_ipred_fn(vl, 16, 16, avx2); /* new AVX2 "vl" 16x16 entry; pairs with cglobal vp9_ipred_vl_16x16_16 in the asm hunk. NOTE(review): decl_ipred_fn is defined outside this hunk; args look like (type, size, bpp, opt) -- confirm */
#define decl_ipred_dir_funcs(type) \
decl_ipred_fns(type, 16, sse2, sse2); \
@@ -143,6 +144,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
#if ARCH_X86_64
init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2);
#endif
+ init_ipred_func(vl, VERT_LEFT, 16, 16, avx2);
}
#endif /* HAVE_X86ASM */
@@ -1538,6 +1538,59 @@ VL_FUNCS 1
INIT_XMM avx
VL_FUNCS 1
+%if HAVE_AVX2_EXTERNAL ; assemble this block only when HAVE_AVX2_EXTERNAL is non-zero
+INIT_YMM avx2 ; mN below are 256-bit registers (16 words each); xmN is the low 128 bits
+cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a ; "vl" 16x16 intra predictor for 16-bit pixels: reads 16 pixels at a, writes 16 rows to dst; l is never read
+ movifnidn aq, amp ; only dst/stride are auto-loaded (2 args), so fetch a if it is not already in its register
+ mova m0, [aq] ; m0 = abcdefghijklmnop (16 pixels at a, one word each; aligned load)
+ vpbroadcastw xm5, [aq+30] ; xm5 = pppppppp (word at byte offset 30, i.e. the last pixel, replicated)
+ vperm2i128 m1, m0, m5, q0201 ; m1 = ijklmnoppppppppp (high lane of m0, then low lane of m5)
+ vpalignr m2, m1, m0, 2 ; m2 = bcdefghijklmnopp (input advanced by 1 pixel = 2 bytes, per lane)
+ vpalignr m3, m1, m0, 4 ; m3 = cdefghijklmnoppp (input advanced by 2 pixels = 4 bytes)
+ mova m4, m2 ; copy m2 so the in-place pavgw below does not destroy it (LOWPASS still needs m2)
+ pavgw m4, m0 ; m4 = rounded average of each pixel and its right neighbour -> source of the even rows
+ LOWPASS 0, 2, 3 ; m0 = BCDEFGHIJKLMNOPp, built from m0/m2/m3 -> source of the odd rows (LOWPASS is defined outside this hunk)
+ vperm2i128 m2, m0, m5, q0201 ; m2 = high lane of m0 followed by p's: the words shifted into the odd rows
+ vperm2i128 m3, m4, m5, q0201 ; m3 = high lane of m4 followed by p's: the words shifted into the even rows
+ DEFINE_ARGS dst, stride, stride3 ; a and l are not used past this point; the third register is renamed stride3
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride, so four rows can be addressed per dst update
+ ; Each row pair is the previous pair advanced by one more pixel (2 bytes), with p's filling the tail; m1/m5 are scratch from here on.
+ mova [dstq+strideq*0], m4 ; row 0: averages, unshifted
+ mova [dstq+strideq*1], m0 ; row 1: filtered, unshifted
+ vpalignr m1, m2, m0, 2 ; m1 = filtered row advanced by 1 pixel
+ vpalignr m5, m3, m4, 2 ; m5 = average row advanced by 1 pixel (the broadcast in xm5 was last needed by the permutes above)
+ mova [dstq+strideq*2], m5 ; row 2
+ mova [dstq+stride3q ], m1 ; row 3
+ vpalignr m1, m2, m0, 4 ; filtered row advanced by 2 pixels
+ vpalignr m5, m3, m4, 4 ; average row advanced by 2 pixels
+ lea dstq, [dstq+strideq*4] ; dst += 4 rows
+ mova [dstq+strideq*0], m5 ; row 4
+ mova [dstq+strideq*1], m1 ; row 5
+ vpalignr m1, m2, m0, 6 ; filtered row advanced by 3 pixels
+ vpalignr m5, m3, m4, 6 ; average row advanced by 3 pixels
+ mova [dstq+strideq*2], m5 ; row 6
+ mova [dstq+stride3q ], m1 ; row 7
+ vpalignr m1, m2, m0, 8 ; filtered row advanced by 4 pixels
+ vpalignr m5, m3, m4, 8 ; average row advanced by 4 pixels
+ lea dstq, [dstq+strideq*4] ; dst += 4 rows
+ mova [dstq+strideq*0], m5 ; row 8
+ mova [dstq+strideq*1], m1 ; row 9
+ vpalignr m1, m2, m0, 10 ; filtered row advanced by 5 pixels
+ vpalignr m5, m3, m4, 10 ; average row advanced by 5 pixels
+ mova [dstq+strideq*2], m5 ; row 10
+ mova [dstq+stride3q ], m1 ; row 11
+ vpalignr m1, m2, m0, 12 ; filtered row advanced by 6 pixels
+ vpalignr m5, m3, m4, 12 ; average row advanced by 6 pixels
+ lea dstq, [dstq+strideq*4] ; dst += 4 rows
+ mova [dstq+strideq*0], m5 ; row 12
+ mova [dstq+strideq*1], m1 ; row 13
+ vpalignr m1, m2, m0, 14 ; filtered row advanced by 7 pixels
+ vpalignr m5, m3, m4, 14 ; average row advanced by 7 pixels
+ mova [dstq+strideq*2], m5 ; row 14
+ mova [dstq+stride3q ], m1 ; row 15
+ RET
+%endif ; HAVE_AVX2_EXTERNAL
+
%macro VR_FUNCS 0
cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
movu m0, [aq-2]
vp9_vert_left_16x16_12bpp_c: 273.8
vp9_vert_left_16x16_12bpp_sse2: 69.4
vp9_vert_left_16x16_12bpp_ssse3: 35.3
vp9_vert_left_16x16_12bpp_avx: 34.6
vp9_vert_left_16x16_12bpp_avx2: 22.4

~35% faster than avx

Signed-off-by: Ilia Valiakhmetov <zakne0ne@gmail.com>
---
 libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 53 +++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)