From cf4dc8fcd974a845b91aaa8685c06fa145b01786 Mon Sep 17 00:00:00 2001
From: Ivan Kalvachev <ikalvachev@gmail.com>
Date: Sat, 5 Aug 2017 20:18:50 +0300
Subject: [PATCH 1/6] Add macros to x86util.asm .
Improved version of VBROADCASTSS that works like the avx2 instruction.
Emulation of vpbroadcastd.
Horizontal sum HSUMPS that places the result in all elements.
Emulation of blendvps and pblendvb.
Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com>
---
libavutil/x86/x86util.asm | 108 ++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 100 insertions(+), 8 deletions(-)
@@ -832,14 +832,25 @@
pmaxsd %1, %2
%endmacro
-%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
-%if cpuflag(avx)
- vbroadcastss %1, %2
-%else ; sse
-%ifnidn %1, %2
- movss %1, %2
-%endif
- shufps %1, %1, 0
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+ vbroadcastss %1, %2 ; ymm, xmm
+%elif cpuflag(avx)
+ %ifnum sizeof%2 ; avx1 register
+ vpermilps xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm
+ %if sizeof%1 >= 32 ; mmsize>=32
+ vinsertf128 %1, %1, xmm%1, 1 ; ymm, ymm, xmm, im
+ %endif
+ %else ; avx1 memory
+ vbroadcastss %1, %2 ; ymm, mm32 || xmm, m32
+ %endif
+%else
+ %ifnum sizeof%2 ; sse register
+ shufps %1, %2, %2, q0000
+ %else ; sse memory
+ movss %1, %2
+ shufps %1, %1, 0
+ %endif
%endif
%endmacro
@@ -854,6 +865,21 @@
%endif
%endmacro
+%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+ vpbroadcastd %1, %2
+%elif cpuflag(avx) && sizeof%1 >= 32
+ %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss
+%else
+ %ifnum sizeof%2 ; sse2 register
+ pshufd %1, %2, q0000
+ %else ; sse memory
+ movd %1, %2
+ pshufd %1, %1, 0
+ %endif
+%endif
+%endmacro
+
%macro SHUFFLE_MASK_W 8
%rep 8
%if %1>=0x80
@@ -918,3 +944,69 @@
movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
%endif
%endmacro
+
+; Horizontal Sum of Packed Single precision floats
+; The resulting sum is in all elements.
+%macro HSUMPS 2 ; dst/src, tmp
+%if cpuflag(avx)
+ %if sizeof%1>=32 ; avx
+ vperm2f128 %2, %1, %1, (0)*16+(1)
+ addps %1, %2
+ %endif
+ shufps %2, %1, %1, q1032
+ addps %1, %2
+ shufps %2, %1, %1, q0321
+ addps %1, %2
+%else ; this form is a bit faster than the short avx-like emulation.
+ movaps %2, %1
+ shufps %1, %1, q1032
+ addps %1, %2
+ movaps %2, %1
+ shufps %1, %1, q0321
+ addps %1, %2
+ ; all %1 members should be equal for as long as float a+b==b+a
+%endif
+%endmacro
+
+; Emulate blendvps if not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro BLENDVPS 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+ blendvps %1, %1, %2, %3
+%elif cpuflag(sse4)
+ %if notcpuflag(avx)
+ %ifnidn %3,xmm0
+ %error sse41 blendvps uses xmm0 as default 3d operand, you used %3
+ %endif
+ %endif
+ blendvps %1, %2, %3
+%else
+ xorps %2, %1
+ andps %2, %3
+ xorps %1, %2
+%endif
+%endmacro
+
+; Emulate pblendvb if not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro PBLENDVB 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+ %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32
+ %error pblendb not possible with ymm on avx1, try blendvps.
+ %endif
+ pblendvb %1, %1, %2, %3
+%elif cpuflag(sse4)
+ %ifnidn %3,xmm0
+ %error sse41 pblendvd uses xmm0 as default 3d operand, you used %3
+ %endif
+ pblendvb %1, %2, %3
+%else
+ pxor %2, %1
+ pand %2, %3
+ pxor %1, %2
+%endif
+%endmacro
--
2.13.2