@@ -52,7 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2);
decl_ipred_fns(dc_top, 16, mmxext, sse2);
decl_ipred_fns(dc_left, 16, mmxext, sse2);
decl_ipred_fn(dl, 16, 16, avx2);
-decl_ipred_fn(dl, 32, 32, avx2);
+decl_ipred_fn(dl, 32, 16, avx2); /* NOTE(review): presumably (type, size, bpp, opt), i.e. the prototype for the asm symbol vp9_ipred_dl_32x32_16 -- confirm against the macro definition */
#define decl_ipred_dir_funcs(type) \
decl_ipred_fns(type, 16, sse2, sse2); \
@@ -136,7 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(1, 1, 64, avg, _16, avx2);
init_fpel_func(0, 1, 128, avg, _16, avx2);
init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
- init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 32, avx2);
+ init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
}
#endif /* HAVE_YASM */
@@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
DEFINE_ARGS dst, stride, stride3, cnt
mov cntd, 2
lea stride3q, [strideq*3]
+
.loop:
mova [dstq+strideq*0], m0
vpalignr m3, m2, m0, 2
@@ -887,24 +888,64 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
movifnidn aq, amp
- mova m0, [aq+mmsize*0] ; abcdefghijklmnop
- mova m1, [aq+mmsize*1] ; qrstuvwxyz012345
- vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555
- vpalignr m2, m1, m0, 2 ; bcdefghijklmnopq
- vpalignr m3, m1, m0, 4 ; cdefghijklmnopqr
- vperm2i128 m5, m1, m4, q0201 ; yz01234555555555
- LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ
- vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455
- vpalignr m3, m5, m1, 4 ; stuvwxyz01234555
- LOWPASS 1, 2, 3 ; RSTUVWXYZ......5
- vperm2i128 m2, m1, m4, q0201 ; Z......555555555
+ mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop
+ mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345
+ vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555
+ vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx
+ vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq
+ vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr
+ LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ
+ vperm2i128 m5, m1, m4, q0201 ; yz01234555555555
+ vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455
+ vpalignr m3, m5, m1, 4 ; stuvwxyz01234555
+ LOWPASS 1, 2, 3 ; RSTUVWXYZ......5
+ vperm2i128 m2, m1, m4, q0201 ; Z......555555555
+ vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3] ; row 3 (and row 7 after the mid-loop dst advance)
+ mov cntd, 4 ; 4 passes x 8 rows = 32 rows
- mova [dstq+strideq*0+0 ], m0
- mova [dstq+strideq*0+32], m1
- vpalignr m3, m1, m0, 2
- vpalignr m4, m2, m1, 2
- mova [dstq+strideq*1+0 ], m3
- mova [dstq+strideq*1+32], m4
+.loop: ; each pass stores 8 rows of 64 bytes; row k is the m5:m0 / m2:m1 window shifted by 2*k bytes
+ mova [dstq+strideq*0 + 0], m0 ; row 0
+ mova [dstq+strideq*0 +32], m1
+ vpalignr m3, m5, m0, 2
+ vpalignr m4, m2, m1, 2
+ mova [dstq+strideq*1 + 0], m3 ; row 1
+ mova [dstq+strideq*1 +32], m4
+ vpalignr m3, m5, m0, 4
+ vpalignr m4, m2, m1, 4
+ mova [dstq+strideq*2 + 0], m3 ; row 2
+ mova [dstq+strideq*2 +32], m4
+ vpalignr m3, m5, m0, 6
+ vpalignr m4, m2, m1, 6
+ mova [dstq+stride3q*1+ 0], m3 ; row 3
+ mova [dstq+stride3q*1+32], m4
+ lea dstq, [dstq+strideq*4] ; step to row 4 so rows 4..7 need only stride*0..3 addressing
+ vpalignr m3, m5, m0, 8
+ vpalignr m4, m2, m1, 8
+ mova [dstq+strideq*0 + 0], m3 ; row 4
+ mova [dstq+strideq*0 +32], m4
+ vpalignr m3, m5, m0, 10
+ vpalignr m4, m2, m1, 10
+ mova [dstq+strideq*1 + 0], m3 ; row 5
+ mova [dstq+strideq*1 +32], m4
+ vpalignr m3, m5, m0, 12
+ vpalignr m4, m2, m1, 12
+ mova [dstq+strideq*2 + 0], m3 ; row 6
+ mova [dstq+strideq*2 +32], m4
+ vpalignr m3, m5, m0, 14
+ vpalignr m4, m2, m1, 14
+ mova [dstq+stride3q*1+ 0], m3 ; row 7 (was stored at row 6, byte offsets +64/+96, outside the 64-byte row)
+ mova [dstq+stride3q*1+32], m4
+ vpalignr m3, m5, m0, 16 ; 16-byte shift: window moved on by 8 pixels, becomes next pass's m0
+ vpalignr m4, m2, m1, 16 ; likewise for the right half, becomes next pass's m1
+ vperm2i128 m5, m3, m4, q0201 ; rebuild the half-register-ahead companion of the new m0
+ vperm2i128 m2, m4, m4, q0101 ; high lane of new m1 in both lanes: feeds the right edge
+ mova m0, m3
+ mova m1, m4
+ lea dstq, [dstq+strideq*4] ; second 4-row step: dst now points at the next group of 8 rows
+ dec cntd
+ jg .loop
RET
%endif