@@ -901,49 +901,68 @@ cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
LOWPASS 1, 2, 3 ; RSTUVWXYZ......5
vperm2i128 m2, m1, m4, q0201 ; Z......555555555
vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY
- DEFINE_ARGS dst, stride, stride3, cnt
+ vperm2i128 m6, m2, m2, q0101 ; 5555555555555555
+ DEFINE_ARGS dst, stride, stride3, dst16, cnt
lea stride3q, [strideq*3]
- mov cntd, 4
+ lea dst16q, [dstq+strideq*8]
+ lea dst16q, [dst16q+strideq*8] ; dst16q = dst + 16 rows
+ mov cntd, 2 ; 2x8 rows; rows 16-31 fall out of rows 0-15
.loop:
mova [dstq+strideq*0 + 0], m0
mova [dstq+strideq*0 +32], m1
+ mova [dst16q+strideq*0+ 0], m1 ; left half of row y+16 = right half of row y
+ mova [dst16q+strideq*0+32], m6 ; right half of row y+16 = replicated edge pixel
vpalignr m3, m5, m0, 2
vpalignr m4, m2, m1, 2
mova [dstq+strideq*1 + 0], m3
mova [dstq+strideq*1 +32], m4
+ mova [dst16q+strideq*1+ 0], m4
+ mova [dst16q+strideq*1+32], m6
vpalignr m3, m5, m0, 4
vpalignr m4, m2, m1, 4
mova [dstq+strideq*2 + 0], m3
mova [dstq+strideq*2 +32], m4
+ mova [dst16q+strideq*2+ 0], m4
+ mova [dst16q+strideq*2+32], m6
vpalignr m3, m5, m0, 6
- vpalignr m4, m2, m1, 6
+ vpalignr m4, m2, m1, 6
mova [dstq+stride3q*1+ 0], m3
mova [dstq+stride3q*1+32], m4
- lea dstq, [dstq+strideq*4]
+ mova [dst16q+stride3q*1+ 0], m4
+ mova [dst16q+stride3q*1+32], m6
vpalignr m3, m5, m0, 8
vpalignr m4, m2, m1, 8
+ lea dstq, [dstq+strideq*4]
+ lea dst16q, [dst16q+strideq*4]
mova [dstq+strideq*0 + 0], m3
mova [dstq+strideq*0 +32], m4
+ mova [dst16q+strideq*0+ 0], m4
+ mova [dst16q+strideq*0+32], m6
vpalignr m3, m5, m0, 10
vpalignr m4, m2, m1, 10
mova [dstq+strideq*1 + 0], m3
mova [dstq+strideq*1 +32], m4
+ mova [dst16q+strideq*1+ 0], m4
+ mova [dst16q+strideq*1+32], m6
vpalignr m3, m5, m0, 12
vpalignr m4, m2, m1, 12
- mova [dstq+strideq*2+ 0], m3
- mova [dstq+strideq*2+32], m4
+ mova [dstq+strideq*2+ 0], m3
+ mova [dstq+strideq*2+32], m4
+ mova [dst16q+strideq*2+ 0], m4
+ mova [dst16q+strideq*2+32], m6
vpalignr m3, m5, m0, 14
vpalignr m4, m2, m1, 14
- mova [dstq+stride3q+ 0], m3
- mova [dstq+stride3q+ 32], m4
- vpalignr m3, m5, m0, 16
- vpalignr m4, m2, m1, 16
- vperm2i128 m5, m3, m4, q0201
- vperm2i128 m2, m4, m4, q0101
- mova m0, m3
- mova m1, m4
+ mova [dstq+stride3q+ 0], m3
+ mova [dstq+stride3q+ 32], m4
+ mova [dst16q+stride3q+ 0], m4
+ mova [dst16q+stride3q+32], m6
+ mova m0, m5 ; advance the filtered-edge window by 16 pixels
+ mova m1, m2
+ vperm2i128 m5, m5, m2, q0201
+ mova m2, m6 ; tail is now all replicated edge pixels
lea dstq, [dstq+strideq*4]
+ lea dst16q, [dst16q+strideq*4]
dec cntd
jg .loop
RET
Use symmetry properties of the ipred_dl function for better performance.

vp9_diag_downleft_32x32_12bpp_c: 1534.2
vp9_diag_downleft_32x32_12bpp_sse2: 145.9
vp9_diag_downleft_32x32_12bpp_ssse3: 140.0
vp9_diag_downleft_32x32_12bpp_avx: 134.8
vp9_diag_downleft_32x32_12bpp_avx2: 78.9

~40% faster than avx

Signed-off-by: Ilia Valiakhmetov <zakne0ne@gmail.com>
---
 libavcodec/x86/vp9intrapred_16bpp.asm | 47 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 14 deletions(-)
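For reference, the symmetry being exploited can be shown with a small scalar
model in C. This is only an illustrative sketch (the names dl_32x32_model and
filt are made up, and FFmpeg's actual C reference handles the last-column
clamp slightly differently): the diag-down-left output depends only on x + y,
so pred[y+16][x] == pred[y][x+16], and the bottom-right 16x16 quadrant is the
replicated last edge pixel. Those are exactly the m4 and m6 values the new
dst16q stores write.

/* Minimal scalar model, hypothetical helper names; not FFmpeg's code. */
#include <assert.h>
#include <stdint.h>

/* 3-tap lowpass, same filter as the LOWPASS macro in the asm */
static uint16_t filt(const uint16_t *a, int i)
{
    return (a[i] + 2 * a[i + 1] + a[i + 2] + 2) >> 2;
}

static void dl_32x32_model(uint16_t dst[32][32], const uint16_t a[66])
{
    for (int y = 0; y < 32; y++)
        for (int x = 0; x < 32; x++)
            dst[y][x] = filt(a, x + y); /* depends only on x + y */
}

int main(void)
{
    uint16_t a[66], dst[32][32];
    for (int i = 0; i < 66; i++)
        a[i] = i < 32 ? (uint16_t)(i * 37 & 1023) : a[31]; /* edge replicated */
    dl_32x32_model(dst, a);
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++) {
            /* left half of row y+16 == right half of row y (the m4 stores) */
            assert(dst[y + 16][x] == dst[y][x + 16]);
            /* bottom-right quadrant is the constant edge pixel (the m6 stores) */
            assert(dst[y + 16][x + 16] == a[31]);
        }
    return 0;
}

That identity is why the loop counter drops from 4 to 2: each iteration still
emits 8 rows of the top half, while the matching rows of the bottom half come
for free as one 32-byte copy plus one constant store.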