[FFmpeg-devel] Add macros used in opus_pvq_search to x86util.asm

Submitted by Ivan Kalvachev on Aug. 5, 2017, 7:10 p.m.

Details

Message ID CABA=pqfT8Py0q4ZZziUrroBGsfchOzXDyL_s_uXuX8ULcWGLjA@mail.gmail.com
State New
Headers show

Commit Message

Ivan Kalvachev Aug. 5, 2017, 7:10 p.m.
Improved version of VBROADCASTSS that works like the avx2 instruction.
Emulation of vpbroadcastd.
Horizontal sum HSUMPS that places the result in all elements.
Emulation of blendvps and pblendvb.

Comments

Henrik Gramner Aug. 6, 2017, 11:12 a.m.
On Sat, Aug 5, 2017 at 9:10 PM, Ivan Kalvachev <ikalvachev@gmail.com> wrote:
> +%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
> +%if cpuflag(avx2)
> +    vbroadcastss  %1, %2                    ; ymm, xmm
> +%elif cpuflag(avx)
> +    %ifnum sizeof%2         ; avx1 register
> +        vpermilps  xmm%1, xmm%2, q0000      ; xmm, xmm, imm || ymm, ymm, imm

Nit: Use shufps instead of vpermilps, it's one byte shorter but
otherwise identical in this case.

c5 e8 c6 ca 00    vshufps xmm1,xmm2,xmm2,0x0
c4 e3 79 04 ca 00 vpermilps xmm1,xmm2,0x0

> +%macro BLENDVPS 3 ; dst/src_a, src_b, mask
> +%if cpuflag(avx)
> +    blendvps  %1, %1, %2, %3
> +%elif cpuflag(sse4)
> +    %if notcpuflag(avx)
> +        %ifnidn %3,xmm0
> +            %error sse41 blendvps uses xmm0 as default 3d operand, you used %3
> +        %endif
> +    %endif

notcpuflag(avx) is redundant (it's always true since AVX uses the first branch).

Patch hide | download patch | download mbox

From cf4dc8fcd974a845b91aaa8685c06fa145b01786 Mon Sep 17 00:00:00 2001
From: Ivan Kalvachev <ikalvachev@gmail.com>
Date: Sat, 5 Aug 2017 20:18:50 +0300
Subject: [PATCH 1/6] Add macros to x86util.asm .

Improved version of VBROADCASTSS that works like the avx2 instruction.
Emulation of vpbroadcastd.
Horizontal sum HSUMPS that places the result in all elements.
Emulation of blendvps and pblendvb.

Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com>
---
 libavutil/x86/x86util.asm | 108 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 100 insertions(+), 8 deletions(-)

diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index cc7d272cad..d460ee5193 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -832,14 +832,25 @@ 
     pmaxsd  %1, %2
 %endmacro
 
-%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
-%if cpuflag(avx)
-    vbroadcastss %1, %2
-%else ; sse
-%ifnidn %1, %2
-    movss        %1, %2
-%endif
-    shufps       %1, %1, 0
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm; copies the low float of src to all of dst
+%if cpuflag(avx2)
+    vbroadcastss  %1, %2                    ; ymm, xmm
+%elif cpuflag(avx)
+    %ifnum sizeof%2         ; avx1 register
+        shufps  xmm%1, xmm%2, xmm%2, q0000  ; same result as vpermilps, one byte shorter
+        %if sizeof%1 >= 32  ; mmsize>=32
+            vinsertf128  %1, %1, xmm%1, 1   ; ymm, ymm, xmm, imm: copy low half to high half
+        %endif
+    %else                   ; avx1 memory
+        vbroadcastss  %1, %2                ; ymm, m32 || xmm, m32
+    %endif
+%else
+    %ifnum sizeof%2         ; sse register
+        shufps  %1, %2, %2, q0000
+    %else                   ; sse memory
+        movss   %1, %2      ; load the scalar into element 0
+        shufps  %1, %1, 0   ; replicate element 0 to all four elements
+    %endif
 %endif
 %endmacro
 
@@ -854,6 +865,21 @@ 
 %endif
 %endmacro
 
+%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm; copies the low dword of src to all of dst
+%if cpuflag(avx2)
+    vpbroadcastd  %1, %2
+%elif cpuflag(avx) && sizeof%1 >= 32
+    %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss
+%else
+    %ifnum sizeof%2         ; sse2 register
+        pshufd  %1, %2, q0000
+    %else                   ; sse2 memory
+        movd    %1, %2      ; load the 32-bit value into the low dword
+        pshufd  %1, %1, 0   ; replicate dword 0 to all four dwords
+    %endif
+%endif
+%endmacro
+
 %macro SHUFFLE_MASK_W 8
     %rep 8
         %if %1>=0x80
@@ -918,3 +944,69 @@ 
     movhlps    %1, %2        ; may cause an int/float domain transition and has a dependency on dst
 %endif
 %endmacro
+
+; Horizontal Sum of Packed Single precision floats
+; The resulting sum is in all elements.
+%macro HSUMPS 2 ; dst/src, tmp (tmp is overwritten)
+%if cpuflag(avx)
+    %if sizeof%1>=32  ; avx, ymm only: combine the two 128-bit halves first
+        vperm2f128  %2, %1, %1, (0)*16+(1)  ; tmp = src with its 128-bit halves swapped
+        addps       %1, %2                  ; both halves now hold the same four partial sums
+    %endif
+    shufps      %2, %1, %1, q1032           ; tmp = 64-bit pairs swapped (assuming x86inc q#### order)
+    addps       %1, %2                      ; elements are now s02, s13, s02, s13
+    shufps      %2, %1, %1, q0321           ; tmp = elements rotated by one position
+    addps       %1, %2                      ; every element = s02 + s13
+%else  ; this form is a bit faster than the short avx-like emulation.
+    movaps      %2, %1                      ; tmp = copy of src, since 2-operand shufps overwrites dst
+    shufps      %1, %1, q1032               ; swap the 64-bit pairs in place
+    addps       %1, %2                      ; pairwise partial sums
+    movaps      %2, %1                      ; tmp = partial sums
+    shufps      %1, %1, q0321               ; rotate the partial sums by one element
+    addps       %1, %2                      ; add the two remaining partial sums
+    ; all %1 members should be equal for as long as float a+b==b+a
+%endif
+%endmacro
+
+; Emulate blendvps if not available
+;
+; Takes src_b where the mask is set and keeps dst/src_a elsewhere.
+; The logical fallback is bitwise: each mask element must be all-ones or all-zeros.
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro BLENDVPS 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    blendvps  %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %ifnidn %3,xmm0
+        %error sse41 blendvps uses xmm0 as default 3d operand, you used %3
+    %endif
+    blendvps  %1, %2, %3
+%else
+    xorps  %2, %1       ; src_b = a ^ b
+    andps  %2, %3       ; src_b = (a ^ b) & mask
+    xorps  %1, %2       ; dst = a ^ ((a ^ b) & mask), i.e. b where mask is set, else a
+%endif
+%endmacro
+
+; Emulate pblendvb if not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro PBLENDVB 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    %if notcpuflag(avx2) && sizeof%1 >= 32  ; avx is already known to be set in this branch
+        %error pblendvb not possible with ymm on avx1, try blendvps.
+    %endif
+    pblendvb  %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %ifnidn %3,xmm0
+        %error sse41 pblendvb uses xmm0 as default 3d operand, you used %3
+    %endif
+    pblendvb  %1, %2, %3
+%else
+    pxor  %2, %1        ; src_b = a ^ b
+    pand  %2, %3        ; src_b = (a ^ b) & mask
+    pxor  %1, %2        ; dst = a ^ ((a ^ b) & mask), i.e. b where mask is set, else a
+%endif
+%endmacro
-- 
2.13.2