
[FFmpeg-devel] v5 Opus Pyramid Vector Quantization Search in x86 SIMD asm

Message ID CABA=pqdZNhBsg5wgP3CM6jgOjCWqb2OY41jduTPnVUBXJ7KZsg@mail.gmail.com
State Superseded

Commit Message

Ivan Kalvachev July 22, 2017, 11:18 a.m. UTC
This patch is ready for review and inclusion.

An explanation of what it does and how it works
can be found in the previous WIP threads:
[v1] http://ffmpeg.org/pipermail/ffmpeg-devel/2017-June/212146.html
[v2] http://ffmpeg.org/pipermail/ffmpeg-devel/2017-June/212816.html
[v3] http://ffmpeg.org/pipermail/ffmpeg-devel/2017-July/213030.html
[v4] http://ffmpeg.org/pipermail/ffmpeg-devel/2017-July/213436.html

The changes compared to WIP v4 are small:
 - Using r4d ops to clear the high bits on int32 arguments (see the short sketch after this list).
 - Correctly map the cglobal register usage.
 - Use SSE4 instead of SSE42, since blend is only SSE4.1.
 - Fix building with --disable-x86asm .

 - Remove testing defines.
Loading constants into registers is (now) always the same speed or faster.
Avoiding the write-forwarding stall is faster on all CPUs except Ryzen.
On Ryzen the alternative is about 7 cycles faster, which is why
I've left that code in, disabled but without a define.
I've also left the two other defines, as they are useful
for debugging and for creating results binary-identical
to other algorithms.

 - Disable the 256-bit AVX2 variant.
I'm leaving the code in the assembly, disabled,
in case it is useful in the future.
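
On the r4d point above, a tiny illustration (not code from the patch, just the
general rule): any write to a 32-bit GPR implicitly zeroes the upper half of
the corresponding 64-bit register, so the int32 arguments can be widened
without an explicit and/movsxd:

        mov         r4d,  Kd        ; the high 32 bits of r4q are now zero
        shl         Nd,   2         ; a 32-bit shift also clears the high half of Nq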

---

I'm including some of the benchmarks.
Some data is removed, since it was used to test different methods.
Benchmarks are done at the default settings (96kbps),
but with different samples. All samples are over 1 hour long.

In summary, the function is about 2-3x faster
than the improved FFmpeg C version.

===========================================================
 K10  AMD Phenom(tm) II X4 945 Processor
//v4
      706   706   706   706   706            // NULL
     4146  4161  4169  4184  4188  4328 4379 // SSE2
     4988  5015  5016  5030  5185            // USE_APPROXIMATION  0
    13860 13828 13846 13846 13831            // C

===========================================================
 Pentium Dual Core E5800
//V4
    3006 3012 3019 3023 3025 // SSE2
    9066 9071 9074 9077 9081 // C

//===========================================================
 Ryzen 1800X
//v3
     357                     // NULL
    1999 2001 2004           // AVX1 GCC
    2010 2029                // SSE4 MSVC
    2012 2026 2027           // AVX1 MSVC
    2166 2170 2171           // AVX2 & STALL_WRITE_FORWARDING 1
    2176 2179 2180 2180 2189 // AVX2
    2226 2230 2234           // AVX2 & USE_APPROXIMATION 0
    6216 6162 6162           // C only GCC
   61909 61545               // C only MSVC
//v4
    1931 1933 1935           // v4 AVX1
    2096 2097 2098           // v4 AVX2 & STALL_WRITE_FORWARDING 1
    2103 2110 2112           // v4 AVX2

//===========================================================
 Intel(R) Core(TM) i7-3930K CPU
//v3
     272            // NULL
    1755 1756 1764  // AVX1
    1847 1855 1866  // SSE4
    2003 2009       // USE_APPROXIMATION  00
    2103 2110 2112  // AVX2
    4855 4856       // C only

//===========================================================
 SkyLake i7 6700HQ
//v2
     264                        // NULL
    1764 1765 1772 1773 1780    // SSE4
    1782 1782 1787 1795 1796    // AVX1
    1805 1807 1807 1811 1815    // AVX1 & USE_APPROXIMATION 0
    1826 1827 1828 1833 1833    // SSE2
    1850 1853 1857 1857 1868    // AVX2
    6878 6934 6879 6921 6899    // C

-b:a 48kbps, 96kbps, 510kbps
sse4:  2049,   1826,     955
sse2:  2065,   1874,     943
avx:   2106,   1868,     950
c:     9202,   7080,    1392

Comments

Rostislav Pehlivanov July 23, 2017, 12:47 a.m. UTC | #1
On 22 July 2017 at 12:18, Ivan Kalvachev <ikalvachev@gmail.com> wrote:

> This patch is ready for review and inclusion.
>
>
>
>+%macro emu_blendvps 3 ; dst/src_a, src_b, mask
>+%macro lea_pic_base 2; reg, const_label
Capitalize macro names.


>+      %error sse41 blendvps uses xmm0 as default 3rd operand, you used %3
Remove this error

>+    %error "blendvps" emulation needs SSE
Definitely remove this error too.

>+        %error AVX1 does not support integer 256bit ymm operations
And this one is particularly pointless...

>+      %error sse41 blendvps uses xmm0 as default 3rd operand, you used %3
Same...

>+    %error "broadcastss" emulation needs SSE
Same...

>+    %error "pbroadcastd" emulation needs SSE2
Yep, the same...

>+    %error pick HSUMPS implementation
Again...

All of these are pointless and unnecessary. Always assume at least SSE2.


>+
>+; The 256bit version seems slower than the 128bit AVX one on all current CPUs,
>+; so disable it for now and keep the code in case something changes in future.
>+%if HAVE_AVX2_EXTERNAL > 0 && 0
>+INIT_YMM avx2
>+PVQ_FAST_SEARCH
>+%endif

Remove this altogether.


>+%if 1
>+    emu_pbroadcastd m1,   xmm1
...
>+%else
>+        ; Ryzen is the only CPU at the moment that doesn't have
>+        ; a write forwarding stall and where this code is faster
>+        ; by about 7 cycles on average.
>+        %{1}ps      m5,   mm_const_float_1
>+        movss       [tmpY + r4q], xmm5      ; Y[max_idx] +-= 1.0;

Remove the %else and always use the first bit of code.


>+%if cpuflag(avx2) && 01
>+%elif cpuflag(avx) && 01
>+%if cpuflag(avx2) && 01

Remove the && 01 in these 3 places.


>+;      vperm2f128   %1,   %1,   %1,   0 ; ymm, ymm, ymm     ; 2-3c 1/1

Remove this commented out code.


>+cglobal pvq_search, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
>+%if ARCH_X86_64 == 0    ;sbrdsp

You're using more than 11 xmm regs, so the entire code is always going to
need a 64 bit system.
So wrap the entire file, from top to bottom after the %include in
%if ARCH_X86_64
<everything>
%endif


>+SECTION_RODATA 64
>+
>+const_float_abs_mask:   times 8 dd 0x7fffffff
>+const_align_abs_edge:   times 8 dd 0
>+
>+const_float_0_5:        times 8 dd 0.5
>+const_float_1:          times 8 dd 1.0
>+const_float_sign_mask:  times 8 dd 0x80000000
>+
>+const_int32_offsets:
>+                        %rep 8
>+                                dd $-const_int32_offsets
>+                        %endrep
>+SECTION .text

This whole thing needs to go right at the very top after the %include. All
macros must then follow it.
Having read-only sections in the middle of the file is very confusing and
not at all what the rest of our asm code follows.
Ivan Kalvachev July 24, 2017, 12:49 p.m. UTC | #2
On 7/23/17, Rostislav Pehlivanov <atomnuker@gmail.com> wrote:
> On 22 July 2017 at 12:18, Ivan Kalvachev <ikalvachev@gmail.com> wrote:
>
>> This patch is ready for review and inclusion.
>>
>>
>>
>>+%macro emu_blendvps 3 ; dst/src_a, src_b, mask
>>+%macro lea_pic_base 2; reg, const_label
> Capitalize macro names.

Done for the latter.
About the former...

I do not like to use capitalization for the emulated instructions:

1. In C macros are always in capital letters to separate them from the
functions.
In ASM functions cannot be mistaken for macros.

2. All instructions are lowercase, even the ones that are overloaded
by x86asm.h macros.

3. There are already about 30 macros with lowercase names in
libavcodec/x86. The rule is not absolute.

4. There are some emulated instructions in x86util that all have upper
case names, and I do find them very ugly when I see them in the code.
This is why I went with the "emu_<op>" route.
I actually want to encourage using emu_<op> for them too (as
alternative names).

5. There is nothing guaranteeing that the assembler
is case-sensitive.
Actually the nasm manual mentions that MASM (on which
it is based) is not case-sensitive (by default).
While nasm and yasm are case-sensitive,
there are still %idefine and %imacro that
could create some confusion (see the sketch below).
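
For illustration only (a made-up snippet, not something from the patch):
with the case-insensitive variants the spelling stops telling you anything,

    %imacro emu_blendvps 3 ; dst/src_a, src_b, mask
        ; ...
    %endmacro
    ; now "EMU_BLENDVPS m0, m1, m2" and "emu_blendvps m0, m1, m2"
    ; invoke the very same macro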

Anyway.

Would you accept a change where the emulation macros
are moved to a separate file and the macros are
renamed to EMU_<op>, e.g. EMU_blendvps?
I'm thinking of "libavcodec/x86/x86emu.asm".

Or maybe they should be put in libavutil/x86/x86util.asm,
however that could open a new can of worms,
i.e. using the ugly uppercase-only names.

>>+      %error sse41 blendvps uses xmm0 as default 3rd operand, you used %3
> Remove this error

I'd like to keep that one.
It helps with finding out why the code does not compile.

Sometimes it may not be obvious, since SWAP might
change the registers under the hood.

>
>>+    %error "blendvps" emulation needs SSE
> Definitely remove this error too.

Done

>>+        %error AVX1 does not support integer 256bit ymm operations
> And this one is particularly pointless...

On the contrary. AVX1 does support 256-bit ymm floating-point operations,
but not integer ones. It is quite a trivial mistake to use one with the other.

What is worse, without this the wrong code would compile
without any hint of an error, because avx2 ops are
not overloaded by the current x86asm.h,
so you won't get a warning that you are using avx2 ops in an avx1 section.
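
To make the distinction concrete, a small sketch (illustrative only, not code
from the patch):

        vaddps      ymm0, ymm1, ymm2    ; fine on AVX1: 256-bit float op
        vpaddd      xmm0, xmm1, xmm2    ; fine on AVX1: integer ops stay 128-bit
        vpaddd      ymm0, ymm1, ymm2    ; requires AVX2: 256-bit integer op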

>>+      %error sse41 blendvps uses xmm0 as default 3rd operand, you used %3
> Same...

Again, I'd like to keep that one.

>>+    %error "broadcastss" emulation needs SSE
> Same...

Done

>>+    %error "pbroadcastd" emulation needs SSE2
> Yep, the same...

Done

>
>>+    %error pick HSUMPS implementation
> Again...

I thought I had taken care of that.
Done

> All of these are pointless and unnecessary. Always assume at least SSE2.

NEVER assume anything (that you can check).
Making wrong assumptions is the easiest way
to make mistakes that are
very hard to notice, find and correct.

>>+
>>+; The 256bit version seems slower than the 128bit AVX one on all current CPUs,
>>+; so disable it for now and keep the code in case something changes in future.
>>+%if HAVE_AVX2_EXTERNAL > 0 && 0
>>+INIT_YMM avx2
>>+PVQ_FAST_SEARCH
>>+%endif
>
> Remove this altogether.

I think this would be a mistake.

BTW, would you do proper benchmarks with the current
code and post the results? There is a chance that the
code has improved.
(Please use a real audio sample, not generated noise.)

I also asked you to try something (changing "cmpless" to "cmpps"),
which you totally forgot to do.
(IACA says there is no penalty in my code for using this SSE op in an AVX2 block,
but as I said, never assume.)
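
For reference, the two forms in question, as I read the suggestion (sketch
only, operands as in the final 1-vs-1 merge):

        cmpless     xmm0, xmm3          ; pseudo-op for: cmpss xmm0, xmm3, 2 (scalar LE)
        cmpps       xmm0, xmm3, 2       ; packed LE compare over all four lanes

Only lane 0 of the mask is consumed at that point, so either form should give
the same result; the question is purely about encodings and penalties.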

>>+%if 1
>>+    emu_pbroadcastd m1,   xmm1
> ...
>>+%else
>>+        ; Ryzen is the only CPU at the moment that doesn't have
>>+        ; a write forwarding stall and where this code is faster
>>+        ; by about 7 cycles on average.
>>+        %{1}ps      m5,   mm_const_float_1
>>+        movss       [tmpY + r4q], xmm5      ; Y[max_idx] +-= 1.0;
>
> Remove the %else and always use the first bit of code.

I don't like removing code that is faster on a new CPU.
But since you insist.
Done.

>>+%if cpuflag(avx2) && 01
>>+%elif cpuflag(avx) && 01
>>+%if cpuflag(avx2) && 01
>
> Remove the && 01 in these 3 places.

Done


>>+;      vperm2f128   %1,   %1,   %1,   0 ; ymm, ymm, ymm     ; 2-3c 1/1
>
> Remove this commented out code.

Done

>
>>+cglobal pvq_search, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
>>+%if ARCH_X86_64 == 0    ;sbrdsp
>
> You're using more than 11 xmm regs, so the entire code is always going to
> need a 64 bit system.
> So wrap the entire file, from top to bottom after the %include in
> %if ARCH_X86_64
> <everything>
> %endif

I'm using exactly 8 XMM registers on 32-bit arch.
I'm using 3 extra registers on 64-bit to hold some constants.

If you insist, I'll write some ifdef-ery to
always supply the correct number of registers; see the sketch below.
From what I've seen in x86asm, the >8 case is only checked on WIN64.
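
A minimal sketch of the ifdef-ery I have in mind (names as in the patch, the
exact counts would still need double-checking):

    %if ARCH_X86_64
    cglobal pvq_search, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
    %else
    cglobal pvq_search, 4, 5+num_pic_regs, 8,  256*4, inX, outY, K, N
    %endif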

>>+SECTION_RODATA 64
>>+
>>+const_float_abs_mask:   times 8 dd 0x7fffffff
>>+const_align_abs_edge:   times 8 dd 0
>>+
>>+const_float_0_5:        times 8 dd 0.5
>>+const_float_1:          times 8 dd 1.0
>>+const_float_sign_mask:  times 8 dd 0x80000000
>>+
>>+const_int32_offsets:
>>+                        %rep 8
>>+                                dd $-const_int32_offsets
>>+                        %endrep
>>+SECTION .text
>
> This whole thing needs to go right at the very top after the %include. All
> macros must then follow it.
> Having read only sections in the middle of the file is very confusing and
> not at all what all of our asm code follows.

It's very simple:
  pre-processor, constants, code.
Your confusion comes from the fact that
the code templates are done by macros.

As I've told you before, the macros before
the constants belong in a separate file.

Also, why are you so obsessed
with constants being the first thing in a file?
You are not supposed to mess with them
on a regular basis.

Constant labels must come before the code that
uses them, but because of the code templates
they could be literally a few lines before the end of the file.


Best Regards.

Patch

From 522ed9d47db739c9c0559f4c040ef5262c685335 Mon Sep 17 00:00:00 2001
From: Ivan Kalvachev <ikalvachev@gmail.com>
Date: Thu, 8 Jun 2017 22:24:33 +0300
Subject: [PATCH 1/5] SIMD opus pvq_search implementation

An explanation of the workings and methods used by the
Pyramid Vector Quantization Search function
can be found in the following Work-In-Progress mail threads:
http://ffmpeg.org/pipermail/ffmpeg-devel/2017-June/212146.html
http://ffmpeg.org/pipermail/ffmpeg-devel/2017-June/212816.html
http://ffmpeg.org/pipermail/ffmpeg-devel/2017-July/213030.html
http://ffmpeg.org/pipermail/ffmpeg-devel/2017-July/213436.html

Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com>
---
 libavcodec/opus_pvq.c              |   3 +
 libavcodec/opus_pvq.h              |   5 +-
 libavcodec/x86/Makefile            |   2 +
 libavcodec/x86/opus_dsp_init.c     |  43 +++
 libavcodec/x86/opus_pvq_search.asm | 554 +++++++++++++++++++++++++++++++++++++
 5 files changed, 605 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/x86/opus_dsp_init.c
 create mode 100644 libavcodec/x86/opus_pvq_search.asm

diff --git a/libavcodec/opus_pvq.c b/libavcodec/opus_pvq.c
index 2ac66a0ede..3aa502929c 100644
--- a/libavcodec/opus_pvq.c
+++ b/libavcodec/opus_pvq.c
@@ -947,6 +947,9 @@  int av_cold ff_celt_pvq_init(CeltPVQ **pvq)
     s->encode_band        = pvq_encode_band;
     s->band_cost          = pvq_band_cost;
 
+    if (ARCH_X86 && CONFIG_OPUS_ENCODER)
+        ff_opus_dsp_init_x86(s);
+
     *pvq = s;
 
     return 0;
diff --git a/libavcodec/opus_pvq.h b/libavcodec/opus_pvq.h
index 6691494838..9246337360 100644
--- a/libavcodec/opus_pvq.h
+++ b/libavcodec/opus_pvq.h
@@ -33,8 +33,8 @@ 
                                        float *lowband_scratch, int fill)
 
 struct CeltPVQ {
-    DECLARE_ALIGNED(32, int,   qcoeff      )[176];
-    DECLARE_ALIGNED(32, float, hadamard_tmp)[176];
+    DECLARE_ALIGNED(32, int,   qcoeff      )[256];
+    DECLARE_ALIGNED(32, float, hadamard_tmp)[256];
 
     float (*pvq_search)(float *X, int *y, int K, int N);
 
@@ -45,6 +45,7 @@  struct CeltPVQ {
 };
 
 int  ff_celt_pvq_init  (struct CeltPVQ **pvq);
+void ff_opus_dsp_init_x86(struct CeltPVQ *s);
 void ff_celt_pvq_uninit(struct CeltPVQ **pvq);
 
 #endif /* AVCODEC_OPUS_PVQ_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 0dbc46504e..9875f48797 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -52,6 +52,7 @@  OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
+OBJS-$(CONFIG_OPUS_ENCODER)            += x86/opus_dsp_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
 OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
 OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp_init.o
@@ -123,6 +124,7 @@  X86ASM-OBJS-$(CONFIG_MDCT15)           += x86/mdct15.o
 X86ASM-OBJS-$(CONFIG_ME_CMP)           += x86/me_cmp.o
 X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP)     += x86/imdct36.o
 X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC)     += x86/mpegvideoencdsp.o
+X86ASM-OBJS-$(CONFIG_OPUS_ENCODER)     += x86/opus_pvq_search.o
 X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP)      += x86/pixblockdsp.o
 X86ASM-OBJS-$(CONFIG_QPELDSP)          += x86/qpeldsp.o                 \
                                           x86/fpel.o                    \
diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c
new file mode 100644
index 0000000000..f4c25822db
--- /dev/null
+++ b/libavcodec/x86/opus_dsp_init.c
@@ -0,0 +1,43 @@ 
+/*
+ * Opus encoder assembly optimizations
+ * Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/opus_pvq.h"
+
+extern float ff_pvq_search_sse2(float *X, int *y, int K, int N);
+extern float ff_pvq_search_sse4(float *X, int *y, int K, int N);
+extern float ff_pvq_search_avx (float *X, int *y, int K, int N);
+
+av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        s->pvq_search = ff_pvq_search_sse2;
+
+    if (EXTERNAL_SSE4(cpu_flags))
+        s->pvq_search = ff_pvq_search_sse4;
+
+    if (EXTERNAL_AVX(cpu_flags))
+        s->pvq_search = ff_pvq_search_avx;
+}
diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm
new file mode 100644
index 0000000000..a80d1c79fa
--- /dev/null
+++ b/libavcodec/x86/opus_pvq_search.asm
@@ -0,0 +1,554 @@ 
+;******************************************************************************
+;* SIMD optimized Opus encoder DSP function
+;*
+;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "config.asm"
+%include "libavutil/x86/x86util.asm"
+
+%ifdef __NASM_VER__
+%use "smartalign"
+ALIGNMODE p6
+%endif
+
+; Use a float op that gives half precision but executes in around 3 cycles.
+; On Skylake & Ryzen the division is much faster (around 11c/3),
+; which makes the full precision code about 2% slower.
+; Opus also uses the rsqrt approximation in its intrinsics code.
+%define USE_APPROXIMATION  01
+
+; Presearch tries to quantize by using the property Sum( abs(x[i]*K)/Sx ) = K.
+; If we use truncation for the division result then the integer Sum would be <= K.
+; By using nearest rounding we get a closer approximation, but
+; we could also get more than K pulses and we have to remove the extra ones.
+; This method is the main improvement of the ffmpeg C function over the Opus original.
+%define PRESEARCH_ROUNDING 01
+
+
+
+;
+; Horizontal Sum Packed Single precision float
+; The resulting sum is in all elements.
+;
+%macro HSUMPS 2 ; dst/src, tmp
+%if cpuflag(avx)
+  %if sizeof%1>=32  ; avx
+       vperm2f128   %2,   %1,   %1,   (0)*16+(1)   ; %2=lo(%1)<<128+hi(%1)
+       vaddps       %1,   %2
+  %endif
+       vshufps      %2,   %1,   %1,   q1032
+       vaddps       %1,   %2
+       vshufps      %2,   %1,   %1,   q0321
+       vaddps       %1,   %2
+
+%elif 01    ; this form is a bit faster than the short avx-like emulation.
+        movaps      %2,   %1            ;    [d,       c,       b,       a       ]
+        shufps      %1,   %1,   q1032   ; %2=[b,       a,       d,       c       ]
+        addps       %1,   %2            ; %1=[b+d,     a+c,     d+b,     c+a     ]
+        movaps      %2,   %1
+        shufps      %1,   %1,   q0321   ; %2=[c+a,     b+d,     a+c,     d+b     ]
+        addps       %1,   %2            ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ]
+        ; all %1 members should be equal for as long as float a+b==b+a
+%else
+    %error pick HSUMPS implementation
+%endif
+%endmacro
+
+;
+; Emulate blendvps if not available
+;
+; src_b is destroyed when using the emulation with logical operations
+; the SSE41 blend instruction is hardwired to use xmm0 as the mask
+;
+%macro emu_blendvps 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+       vblendvps    %1,   %1,   %2,   %3
+;-------------------------
+%elif cpuflag(sse4)
+  %if notcpuflag(avx)
+    %ifnidn %3,xmm0
+      %error sse41 blendvps uses xmm0 as default 3rd operand, you used %3
+    %endif
+  %endif
+        blendvps    %1,   %2,   %3
+;-------------------------
+%elif cpuflag(sse)
+        xorps       %2,   %1
+        andps       %2,   %3
+        xorps       %1,   %2
+%else
+    %error "blendvps" emulation needs SSE
+%endif
+%endmacro
+
+;
+; Emulate pblendvb if not available
+;
+; src_b is destroyed when using the emulation with logical operations
+; the SSE41 blend instruction is hardwired to use xmm0 as the mask
+;
+%macro emu_pblendvb 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+  %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32
+        %error AVX1 does not support integer 256bit ymm operations
+  %endif
+
+       vpblendvb    %1,   %1,   %2,   %3
+;-------------------------
+%elif cpuflag(sse4)
+    %ifnidn %3,xmm0
+      %error sse41 blendvps uses xmm0 as default 3rd operand, you used %3
+    %endif
+        pblendvb    %1,   %2,   %3
+;-------------------------
+%else
+        pxor        %2,   %1
+        pand        %2,   %3
+        pxor        %1,   %2
+%endif
+%endmacro
+
+;
+; Emulate broadcastss if not available
+;
+%macro emu_broadcastss 2 ; dst, src
+%if cpuflag(avx2) && 01
+       vbroadcastss %1,   %2            ; ymm, xmm
+;-------------------------
+%elif cpuflag(avx) && 01
+  %ifnum sizeof%2       ; avx1 register
+       vpermilps    xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm
+    %if sizeof%1 >= 32  ;mmsize>=32
+       vinsertf128  %1,   %1,xmm%1,   1 ; ymm, ymm, xmm, im ; 3c   1/1
+;      vperm2f128   %1,   %1,   %1,   0 ; ymm, ymm, ymm     ; 2-3c 1/1
+    %endif
+  %else                 ; avx1 memory
+       vbroadcastss %1,   %2            ; ymm, mm32 || xmm, mm32
+  %endif
+;-------------------------
+%elif cpuflag(sse) && 1
+  %ifnum sizeof%2       ; sse register
+        shufps      %1,   %2,   %2,   q0000
+  %else                 ; sse memory
+        movss       %1,   %2            ; this zeroes the other 3 elements
+        shufps      %1,   %1,  0
+  %endif
+%else
+    %error "broadcastss" emulation needs SSE
+%endif
+%endmacro
+
+;
+; Emulate pbroadcastd if not available
+;
+%macro emu_pbroadcastd 2 ; dst, src
+%if cpuflag(avx2) && 01
+       vpbroadcastd %1,   %2
+%elif cpuflag(avx) && mmsize >= 32
+        %error AVX1 does not support integer 256bit ymm operations
+%elif cpuflag(sse2)
+  %ifnum sizeof%2       ; sse2 register
+        pshufd      %1,   %2,   q0000
+  %else                 ; sse memory
+        movd        %1,   %2            ; movd zero extends
+        pshufd      %1,   %1,   0
+  %endif
+%else
+    %error "pbroadcastd" emulation needs SSE2
+%endif
+%endmacro
+
+;
+;   Setup High register to be used
+;   for holding memory constants
+;
+; %1 - the register to be used, assumes it is >= mm8
+; %2 - name of the constant.
+;
+; Subsequent opcodes are going to use the constant in the form
+; "addps m0, mm_const_name" and it would be turned into:
+; "addps m0, [const_name]" on 32 bit arch or
+; "addps m0, m8" on 64 bit arch
+%macro set_hi_mm_constant 3 ; movop, reg, const_name
+%if num_mmregs > 8
+       %define      mm_%3 %2
+       %{1}         %2,   [%3]  ; movaps m8, [const_name]
+%else
+       %define      mm_%3 [%3]
+%endif
+%endmacro
+
+;
+;   Load Effective Address of
+;     Position Independent Code
+;       Base address of a constant
+; %1 - the register to be used, if PIC is set
+; %2 - name of the constant.
+;
+; Subsequent opcodes are going to use the base address in the form
+; "movaps m0, [pic_base_constant_name+r4q]" and it would be turned into
+; "movaps m0, [r5q + r4q]" if PIC is enabled
+; "movaps m0, [constant_name + r4q]" if texrel are used
+%macro lea_pic_base 2; reg, const_label
+%ifdef PIC
+        lea         %1,  [%2]    ; rip+const
+        %define     pic_base_%2 %1
+%else
+        %define     pic_base_%2 %2
+%endif
+%endmacro
+
+;
+; We need one more register for
+; PIC relative addressing. Use this
+; to count it in cglobal
+;
+%ifdef PIC
+  %define num_pic_regs 1
+%else
+  %define num_pic_regs 0
+%endif
+
+
+
+
+SECTION_RODATA 64
+
+const_float_abs_mask:   times 8 dd 0x7fffffff
+const_align_abs_edge:   times 8 dd 0
+
+const_float_0_5:        times 8 dd 0.5
+const_float_1:          times 8 dd 1.0
+const_float_sign_mask:  times 8 dd 0x80000000
+
+const_int32_offsets:
+                        %rep 8
+                                dd $-const_int32_offsets
+                        %endrep
+SECTION .text
+
+
+
+
+%macro PULSES_SEARCH 1
+; m6 Syy_norm
+; m7 Sxy_norm
+        addps       m6,   mm_const_float_0_5    ; Syy_norm += 1.0/2
+        pxor        m1,   m1            ; max_idx
+        xorps       m3,   m3            ; p_max
+        xor         r4q,  r4q
+align 16
+%%distortion_search:
+        movd        xmm2, dword r4d     ; movd zero extends
+%ifidn %1,add
+        movaps      m4,   [tmpY + r4q]  ; y[i]
+        movaps      m5,   [tmpX + r4q]  ; X[i]
+
+  %if   USE_APPROXIMATION == 1
+        xorps       m0,   m0
+        cmpps       m0,   m0,   m5,   4 ; m0 = (X[i] != 0.0)
+  %endif
+
+        addps       m4,   m6            ; m4 = Syy_new = y[i] + Syy_norm
+        addps       m5,   m7            ; m5 = Sxy_new = X[i] + Sxy_norm
+
+  %if   USE_APPROXIMATION == 1
+        andps       m5,   m0            ; if(X[i] == 0) Sxy_new = 0; Prevents approximation error from setting pulses in array padding.
+  %endif
+
+%else
+        movaps      m5,   [tmpY + r4q]          ; m5 = y[i]
+
+        xorps       m0,   m0                    ; m0 = 0;
+        cmpps       m0,   m0,   m5,   1         ; m0 = (0<y)
+
+        subps       m4,   m6,   m5              ; m4 = Syy_new = Syy_norm - y[i]
+        subps       m5,   m7,   [tmpX + r4q]    ; m5 = Sxy_new = Sxy_norm - X[i]
+        andps       m5,   m0                    ; (0<y)?m5:0
+%endif
+
+%if   USE_APPROXIMATION == 1
+        rsqrtps     m4,   m4
+        mulps       m5,   m4            ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
+%else
+        mulps       m5,   m5
+        divps       m5,   m4            ; m5 = p = Sxy_new*Sxy_new/Syy
+%endif
+    emu_pbroadcastd m2,   xmm2          ; m2=i (all lanes get same values, we add the offset-per-lane, later)
+
+        cmpps       m0,   m3,   m5,   1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
+        maxps       m3,   m5            ; m3=max(p_max,p)
+                                        ; maxps here is faster than blendvps, despite blend having lower latency.
+
+        pand        m2,   m0            ; This version seems faster than sse41 pblendvb
+        pmaxsw      m1,   m2            ; SSE2 signed word, so it would work for N < 32768/4
+
+        add         r4q,  mmsize
+        cmp         r4q,  Nq
+        jb          %%distortion_search
+
+        por         m1,   mm_const_int32_offsets    ; max_idx offsets per individual lane (skipped in the inner loop)
+        movdqa      m4,   m1                        ; needed for the aligned y[max_idx]+=1; processing
+
+%if mmsize >= 32
+; Merge parallel maximums round 8 (4 vs 4)
+
+       vextractf128 xmm5, ymm3, 1       ; xmm5 = ymm3[1x128] = ymm3[255..128b]
+       vcmpps       xmm0, xmm3, xmm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
+
+       vextracti128 xmm2, ymm1, 1       ; xmm2 = ymm1[1x128] = ymm1[255..128b]
+    emu_blendvps    xmm3, xmm5, xmm0    ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
+    emu_pblendvb    xmm1, xmm2, xmm0    ; p       = m0 ? p[1x128]       : p[0x128]
+%endif
+
+; Merge parallel maximums round 4 (2 vs 2)
+                                               ; m3=p[3210]
+        movhlps     xmm5, xmm3                 ; m5=p[xx32]
+        cmpps       xmm0, xmm3, xmm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
+
+        pshufd      xmm2, xmm1, q3232
+    emu_blendvps    xmm3, xmm5, xmm0    ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
+    emu_pblendvb    xmm1, xmm2, xmm0    ; p       = m0 ? p[3,2]       : p[1,0]
+
+; Merge parallel maximums final round (1 vs 1)
+        movaps      xmm0, xmm3          ; m0 = m3[0] = p[0]
+        shufps      xmm3, xmm3, q1111   ; m3 = m3[1] = p[1]
+        cmpless     xmm0, xmm3
+
+        pshufd      xmm2, xmm1, q1111
+    emu_pblendvb    xmm1, xmm2, xmm0
+
+        movd  dword r4d,  xmm1          ; zero extends to the rest of r4q
+
+    emu_broadcastss m3,   [tmpX + r4q]
+        %{1}ps      m7,   m3            ; Sxy += X[max_idx]
+
+    emu_broadcastss m5,   [tmpY + r4q]
+        %{1}ps      m6,   m5            ; Syy += Y[max_idx]
+
+        ; We have to update a single element in Y[i]
+        ; However, writing 4 bytes and then doing a 16 byte load in the inner loop
+        ; could cause a stall due to breaking write forwarding.
+%if 1
+    emu_pbroadcastd m1,   xmm1
+        pcmpeqd     m1,   m1,   m4          ; exactly 1 element matches max_idx and this finds it
+
+        and         r4q,  ~(mmsize-1)       ; align address down, so the value pointed by max_idx is inside a mmsize load
+        movaps      m5,   [tmpY + r4q]      ; m5 = Y[y3...ym...y0]
+        andps       m1,   mm_const_float_1  ; m1 =  [ 0...1.0...0]
+        %{1}ps      m5,   m1                ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
+        movaps      [tmpY + r4q], m5        ; Y[max_idx] +-= 1.0;
+%else
+        ; Ryzen is the only CPU at the moment that doesn't have
+        ; a write forwarding stall and where this code is faster
+        ; by about 7 cycles on average.
+        %{1}ps      m5,   mm_const_float_1
+        movss       [tmpY + r4q], xmm5      ; Y[max_idx] +-= 1.0;
+%endif
+
+%endmacro
+
+;
+; Pyramid Vector Quantization Search implementation
+;
+; float * inX   - Unaligned (SIMD) access, it will be overread,
+;                 but extra data is masked away.
+; int32 * outY  - Should be an aligned and padded buffer.
+;                 It is used as a temp buffer.
+; uint32 K      - Number of pulses to have after quantization.
+; uint32 N      - Number of vector elements. Must be 0 < N < 256
+;
+%macro PVQ_FAST_SEARCH 0
+cglobal pvq_search, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
+%define tmpX rsp
+%define tmpY outYq
+
+
+        movaps      m0,   [const_float_abs_mask]
+        shl         Nd,   2             ; N *= sizeof(float) ; also 32 bit operation zeroes the high 32 bits in 64 bit mode.
+        mov         r4q,  Nq
+
+        neg         r4q
+        and         r4q,  mmsize-1
+
+        lea_pic_base r5q, const_align_abs_edge    ; rip+const
+        movups      m3,   [pic_base_const_align_abs_edge + r4q - mmsize]
+
+        add         Nq,   r4q           ; Nq = align(Nq, mmsize)
+
+        lea         r4q,  [Nq - mmsize] ; Nq is rounded up (aligned up) to mmsize, so r4q can't become negative here, unless N=0.
+        movups      m2,   [inXq + r4q]
+        andps       m2,   m3
+        movaps      [tmpX + r4q], m2
+        movaps      m1,   m2            ; Sx
+
+align 16
+%%loop_abs_sum:
+        sub         r4q,  mmsize
+        jc          %%end_loop_abs_sum
+
+        movups      m2,   [inXq + r4q]
+        andps       m2,   m0
+
+        movaps      [tmpX + r4q], m2    ; tmpX[i]=abs(X[i])
+        addps       m1,   m2
+        jmp         %%loop_abs_sum
+
+align 16
+%%end_loop_abs_sum:
+
+        HSUMPS      m1,   m2            ; m1  = Sx
+
+        xorps       m0,   m0
+        comiss      xmm0, xmm1          ;
+        jz          %%zero_input        ; if (Sx==0) goto zero_input
+
+        cvtsi2ss    xmm0, dword Kd      ; m0  = K
+%if USE_APPROXIMATION == 1
+        rcpss       xmm1, xmm1          ; m1 = approx(1/Sx)
+        mulss       xmm0, xmm1          ; m0 = K*(1/Sx)
+%else
+        divss       xmm0, xmm1          ; b = K/Sx
+                                        ; b = K/max_x
+%endif
+
+    emu_broadcastss m0,   xmm0
+
+        lea         r4q,  [Nq - mmsize]
+        pxor        m5,   m5            ; Sy    ( Sum of abs( y[i]) )
+        xorps       m6,   m6            ; Syy   ( Sum of y[i]*y[i]  )
+        xorps       m7,   m7            ; Sxy   ( Sum of X[i]*y[i]  )
+align 16
+%%loop_guess:
+        movaps      m1,   [tmpX + r4q]  ; m1   = X[i]
+        mulps       m2,   m0,   m1      ; m2   = res*X[i]
+  %if PRESEARCH_ROUNDING == 0
+        cvttps2dq   m2,   m2            ; yt   = (int)truncf( res*X[i] )
+  %else
+        cvtps2dq    m2,   m2            ; yt   = (int)lrintf( res*X[i] )
+  %endif
+        paddd       m5,   m2            ; Sy  += yt
+        cvtdq2ps    m2,   m2            ; yt   = (float)yt
+        mulps       m1,   m2            ; m1   = X[i]*yt
+        movaps      [tmpY + r4q], m2    ; y[i] = m2
+        addps       m7,   m1            ; Sxy += m1;
+        mulps       m2,   m2            ; m2   = yt*yt
+        addps       m6,   m2            ; Syy += m2
+
+        sub         r4q,  mmsize
+        jnc         %%loop_guess
+
+        HSUMPS      m6,   m1        ; Syy_norm
+        HADDD       m5,   m4        ; pulses
+
+        movd  dword r4d,  xmm5      ; zero extends to the rest of r4q
+
+        sub         Kd,   r4d       ; K -= pulses, also a 32 bit operation zeroes the high 32 bits in 64 bit mode.
+        jz          %%finish        ; K - pulses == 0
+
+        set_hi_mm_constant movaps, m8,  const_float_0_5
+        set_hi_mm_constant movaps, m9,  const_float_1
+        set_hi_mm_constant movdqa, m10, const_int32_offsets
+        ; Use Syy/2 in distortion parameter calculations.
+        ; Saves pre- and post-calculation to correct the Y[] values.
+        ; Same precision, since the float mantissa is normalized.
+        ; The SQRT approximation does differ.
+        HSUMPS      m7,   m0        ; Sxy_norm
+        mulps       m6,   mm_const_float_0_5
+
+        jc          %%remove_pulses_loop    ; K - pulses < 0
+
+align 16                                    ; K - pulses > 0
+%%add_pulses_loop:
+
+        PULSES_SEARCH add   ; m6 Syy_norm ; m7 Sxy_norm
+
+        sub         Kd,   1
+        jnz         %%add_pulses_loop
+
+        addps       m6,   m6        ; Syy*=2
+
+        jmp         %%finish
+
+align 16
+%%remove_pulses_loop:
+
+        PULSES_SEARCH sub   ; m6 Syy_norm ; m7 Sxy_norm
+
+        add         Kd,   1
+        jnz         %%remove_pulses_loop
+
+        addps       m6,   m6        ; Syy*=2
+
+align 16
+%%finish:
+        lea         r4q,  [Nq - mmsize]
+
+        movaps      m2,   [const_float_sign_mask]
+align 16
+%%restore_sign_loop:
+        movaps      m0,   [tmpY + r4q]  ; m0 = Y[i]
+        movups      m1,   [inXq + r4q]  ; m1 = X[i]
+        andps       m1,   m2            ; m1 = sign(X[i])
+        orps        m0,   m1            ; m0 = Y[i]*sign
+        cvtps2dq    m3,   m0            ; m3 = (int)m0
+        movaps      [outYq + r4q], m3
+
+        sub         r4q,  mmsize
+        jnc         %%restore_sign_loop
+%%return:
+
+%if ARCH_X86_64 == 0    ;sbrdsp
+        movss       r0m,  xmm6      ; return (float)Syy_norm
+        fld         dword r0m
+%else
+        movaps      m0,   m6        ; return (float)Syy_norm
+%endif
+        RET
+
+align 16
+%%zero_input:
+        lea         r4q,  [Nq - mmsize]
+        xorps       m0,   m0
+%%zero_loop:
+        movaps      [outYq + r4q], m0
+        sub         r4q,  mmsize
+        jnc         %%zero_loop
+
+        movaps      m6,   [const_float_1]
+        jmp         %%return
+%endmacro
+
+
+INIT_XMM sse2
+PVQ_FAST_SEARCH
+
+INIT_XMM sse4
+PVQ_FAST_SEARCH
+
+INIT_XMM avx
+PVQ_FAST_SEARCH
+
+; The 256bit version seems slower than the 128bit AVX one on all current CPUs,
+; so disable it for now and keep the code in case something changes in future.
+%if HAVE_AVX2_EXTERNAL > 0 && 0
+INIT_YMM avx2
+PVQ_FAST_SEARCH
+%endif
-- 
2.13.2