From 7d875390bb81973b6e4d1fb8ad54594ea3f8d5da Mon Sep 17 00:00:00 2001
From: Ivan Kalvachev <ikalvachev@gmail.com>
Date: Thu, 8 Jun 2017 22:24:33 +0300
Subject: [PATCH 1/3] SIMD opus pvq_search implementation v2
---
libavcodec/opus_pvq.c | 9 +
libavcodec/opus_pvq.h | 5 +-
libavcodec/x86/Makefile | 1 +
libavcodec/x86/opus_dsp_init.c | 47 +++
libavcodec/x86/opus_pvq_search.asm | 628 +++++++++++++++++++++++++++++++++++++
5 files changed, 688 insertions(+), 2 deletions(-)
create mode 100644 libavcodec/x86/opus_dsp_init.c
create mode 100644 libavcodec/x86/opus_pvq_search.asm
@@ -418,7 +418,13 @@ static uint32_t celt_alg_quant(OpusRangeCoder *rc, float *X, uint32_t N, uint32_
int *y = pvq->qcoeff;
celt_exp_rotation(X, N, blocks, K, spread, 1);
+
+ {
+ START_TIMER
gain /= sqrtf(pvq->pvq_search(X, y, K, N));
+ STOP_TIMER("pvq_search");
+ }
+
celt_encode_pulses(rc, y, N, K);
celt_normalize_residual(y, X, N, gain);
celt_exp_rotation(X, N, blocks, K, spread, 0);
@@ -947,6 +953,9 @@ int av_cold ff_celt_pvq_init(CeltPVQ **pvq)
s->encode_band = pvq_encode_band;
s->band_cost = pvq_band_cost;
+ if (ARCH_X86)
+ ff_opus_dsp_init_x86(s);
+
*pvq = s;
return 0;
@@ -33,8 +33,8 @@
float *lowband_scratch, int fill)
struct CeltPVQ {
- DECLARE_ALIGNED(32, int, qcoeff )[176];
- DECLARE_ALIGNED(32, float, hadamard_tmp)[176];
+ DECLARE_ALIGNED(32, int, qcoeff )[256];
+ DECLARE_ALIGNED(32, float, hadamard_tmp)[256];
float (*pvq_search)(float *X, int *y, int K, int N);
@@ -45,6 +45,7 @@ struct CeltPVQ {
};
int ff_celt_pvq_init (struct CeltPVQ **pvq);
+void ff_opus_dsp_init_x86(struct CeltPVQ *s);
void ff_celt_pvq_uninit(struct CeltPVQ **pvq);
#endif /* AVCODEC_OPUS_PVQ_H */
@@ -52,6 +52,7 @@ OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
+OBJS-$(CONFIG_OPUS_ENCODER) += x86/opus_dsp_init.o x86/opus_pvq_search.o
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
new file mode 100644
@@ -0,0 +1,47 @@
+/*
+ * Opus encoder assembly optimizations
+ * Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/opus_pvq.h"
+
+extern float ff_pvq_search_sse2 (float *X, int *y, int K, int N);
+extern float ff_pvq_search_sse42(float *X, int *y, int K, int N);
+extern float ff_pvq_search_avx (float *X, int *y, int K, int N);
+extern float ff_pvq_search_avx2 (float *X, int *y, int K, int N);
+
+av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
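+    /* The checks below are ordered from oldest to newest instruction set;
+     * each later one overrides the previous assignment, so the fastest
+     * supported version ends up being used. */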
+ if (EXTERNAL_SSE2(cpu_flags))
+ s->pvq_search = ff_pvq_search_sse2;
+
+ if (EXTERNAL_SSE42(cpu_flags))
+ s->pvq_search = ff_pvq_search_sse42;
+
+ if (EXTERNAL_AVX(cpu_flags))
+ s->pvq_search = ff_pvq_search_avx;
+
+ if (EXTERNAL_AVX2(cpu_flags))
+ s->pvq_search = ff_pvq_search_avx2;
+}
new file mode 100644
@@ -0,0 +1,628 @@
+; Opus encoder assembly optimizations
+; Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
+
+%include "config.asm"
+%include "libavutil/x86/x86util.asm"
+
+
+%define HADDPS_IS_FAST 0
+%define PHADDD_IS_FAST 0
+%define BLENDVPS_IS_FAST 01
+%define PBLENDVB_IS_FAST 01
+%define CONST_IN_X64_REG_IS_FASTER 01
+
+%define STALL_WRITE_FORWARDING 0
+%define SHORT_SYY_UPDATE 01
+
+%define USE_APPROXIMATION 01
+%define PRESEARCH_ROUNDING 01
+%define ALL_FLOAT_PRESEARCH cpuflag(sse42) && 0
+
+
+;
+; Horizontal Sum Packed Single precision float
+; The resulting sum is in all elements.
+;
+%macro HSUMPS 2 ; dst, tmp
+%if cpuflag(sse42) && HADDPS_IS_FAST
+ %if sizeof%1>=32 ; avx mmsize=32
+ vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1)
+ addps %1, %2
+ %endif ; sse&avx mmsize=16
+ haddps %1, %1 ; 5-6c 2/1
+ haddps %1, %1
+%elif cpuflag(avx)
+ %if sizeof%1>=32 ; avx
+ vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1)
+ addps %1, %2
+ %endif
+ vshufps %2, %1, %1, q1032 ; %2=[b, a, d, c ]
+ addps %1, %2 ; %1=[b+d, a+c, d+b, c+a ]
+ vshufps %2, %1, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ]
+ addps %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ]
+
+%elif 01 ; this form is a bit faster than the short avx-like emulation.
+ movaps %2, %1 ; [d, c, b, a ]
+ shufps %1, %1, q1032 ; %2=[b, a, d, c ]
+ addps %1, %2 ; %1=[b+d, a+c, d+b, c+a ]
+ movaps %2, %1
+ shufps %1, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ]
+ addps %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ]
+ ; all %1 elements should now be equal (as long as float a+b == b+a)
+%else
+ %error pick HSUMPS implementation
+%endif
+%endmacro
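+
+; For reference, the scalar effect of HSUMPS (illustration only, not assembled):
+; with 4 float lanes,
+;   float s = d[0] + d[1] + d[2] + d[3];
+;   d[0] = d[1] = d[2] = d[3] = s;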
+
+
+;
+; Horizontal Sum integer Double word
+; The resulting sum is in all elements.
+;
+%macro HSUMDQ 2 ; dst, tmp
+%if cpuflag(avx) && notcpuflag(avx2) && sizeof%1>=32
+ %error emulating an integer 256-bit ymm operation with floating-point ops; there will be a speed penalty.
+ vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1)
+ paddd xmm%1,xmm%2
+ phaddd xmm%1,xmm%1
+ phaddd xmm%1,xmm%1
+ vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm, im ; 3c 1/1
+%elif cpuflag(ssse3) && PHADDD_IS_FAST
+ %if sizeof%1>=32
+ vperm2i128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1)
+ paddd %1, %2
+ %endif ;mmsize == 16
+ phaddd %1, %1 ;3 cycles on intel
+ phaddd %1, %1
+ ;6 cycles total (sse)
+%elif 01
+ %if cpuflag(avx2) && sizeof%1>=32 ;mmsize>=32
+ vperm2i128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1)
+ paddd %1, %2
+ %endif
+ pshufd %2, %1, q1032 ; %2=[b, a, d, c ]
+ paddd %1, %2 ; %1=[b+d, a+c, d+b, c+a ]
+ pshufd %2, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ]
+ paddd %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ]
+ ;4 cycles total (sse)
+%else
+ %error pick HSUMDQ implementation
+%endif
+%endmacro
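+
+; HSUMDQ has the same scalar effect as HSUMPS above, but on int32 lanes:
+; every dword lane ends up holding the total of all dwords in the register.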
+
+
+;
+; Emulate blendvps if not available
+;
+; src_b is destroyed when the emulation with bitwise logic ops is used
+; The SSE4.1 blend instruction is hardwired to use xmm0 as the mask
+;
+%macro emu_blendvps 3 ; dst_a, src_b, mask
+%if cpuflag(avx) && BLENDVPS_IS_FAST
+ blendvps %1, %1, %2, %3
+%elif cpuflag(sse42) && BLENDVPS_IS_FAST
+ %if notcpuflag(avx)
+ %ifnidn %3,xmm0
+ %error sse41 blendvps uses xmm0 as the implicit 3rd operand, you used %3
+ %endif
+ %endif
+ blendvps %1, %2, %3
+%elif 01
+ xorps %2, %1
+ andps %2, %3
+ xorps %1, %2
+%else
+ %error pick "blendvps" emulation
+%endif
+%endmacro
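+
+; Descriptive note on the xorps/andps/xorps fallback above (it assumes each
+; mask lane is all-ones or all-zeros, which holds for the cmpps results used
+; in this file):
+;   b ^= a;  b &= mask;  a ^= b;
+; mask lane all-ones:  a ^= (b ^ a)  ->  a = b   (take src_b)
+; mask lane all-zeros: a ^= 0        ->  a = a   (keep dst_a)
+; This is also why src_b is clobbered by the emulation.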
+
+
+;
+; Emulate pblendvb if not available
+;
+; src_b is destroyed when the emulation with bitwise logic ops is used
+; The SSE4.1 blend instruction is hardwired to use xmm0 as the mask
+;
+%macro emu_pblendvb 3 ; dst_a, src_b, mask
+%if cpuflag(avx2) && PBLENDVB_IS_FAST
+ pblendvb %1, %1, %2, %3
+;-------------------------
+%elif cpuflag(avx) && notcpuflag(avx2) && PBLENDVB_IS_FAST
+ %if sizeof%1>=32
+;AVX1 does not provide a pblendvb that works on ymm registers,
+;and it does not provide a ymm pxor either, so no integer emulation is possible.
+;Use a floating-point blend and hope the penalty is not severe.
+ %error emulating an integer 256-bit ymm operation with floating-point ops; there will be a speed penalty.
+ vblendvps %1, %1, %2, %3
+ %else
+ vpblendvb %1, %1, %2, %3
+ %endif
+;-------------------------
+%elif cpuflag(sse42) && PBLENDVB_IS_FAST
+ %ifnidn %3,xmm0
+ %error sse41 pblendvb uses xmm0 as the implicit 3rd operand, you used %3
+ %endif
+ pblendvb %1, %2, %3
+;-------------------------
+%elif 01
+;On my CPU, this logic-op version is actually faster than pblendvb!
+;pblendvb is much slower than blendvps, go figure!
+ pxor %2, %1
+ pand %2, %3
+ pxor %1, %2
+%else
+ %error pick "pblendvb" emulation
+%endif
+%endmacro
+
+
+;
+; Emulate broadcastss if not available
+;
+%macro emu_broadcastss 2 ; dst_a, src_b
+%if cpuflag(avx2) && 01
+ vbroadcastss %1, %2 ; ymm, xmm
+
+%elif cpuflag(avx) && 01
+ %ifnum sizeof%2 ; avx1 register
+ vpermilps xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm
+ %if sizeof%1 >= 32 ;mmsize>=32
+ vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm, im ; 3c 1/1
+; vperm2f128 %1, %1, %1, 0 ; ymm, ymm, ymm ; 2-3c 1/1
+ %endif
+ %else ; avx1 memory
+ vbroadcastss %1, %2 ; ymm, mm32 || xmm, mm32
+ %endif
+%elif cpuflag(sse) && 1
+ %ifnum sizeof%2 ; sse register
+ shufps %1, %2, %2, q0000
+ %else ; sse memory
+; xorps %1, %1 ; this seems to provide minor speedup
+ movss %1, %2 ; this zeroes the other 3 elements
+ shufps %1, %1, 0
+ %endif
+%else
+ %error invalid op, SSE needed for emulation
+%endif
+%endmacro
+
+;
+; Emulate pbroadcastd if not available
+;
+%macro emu_pbroadcastd 2 ; dst_a, src_b
+%if cpuflag(avx2) && 01
+ vpbroadcastd %1, %2
+%elif cpuflag(avx) && mmsize >= 32
+ ;there are no integer operations with 256bit ymm registers on AVX1
+ pshufd xmm%1, xmm%2, q0000 ; xmm, xmm, im
+ vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm. im
+ %error emulating an integer 256-bit ymm operation with floating-point ops; there will be a speed penalty.
+%elif cpuflag(sse)
+ %ifnum sizeof%2 ; sse register
+ pshufd %1, %2, q0000
+ %else ; sse memory
+; pxor %1, %1 ; this seems to provide a minor speedup. On Intel, register zeroing is handled at rename, so it costs no micro-ops, latency or throughput.
+ movd %1, %2 ; movd zero extends
+ pshufd %1, %1, 0
+ %endif
+%else
+ %error invalid op, SSE2 needed
+%endif
+%endmacro
+
+
+;big ugly hack
+%if cpuflag(avx) && notcpuflag(avx2) && mmsize == 32
+; AVX1 does not implement integer operations on 256-bit ymm registers.
+; Some of them, however, can be emulated with floating-point operations.
+ %define ALL_FLOAT_PRESEARCH 1
+ %define pxor xorps
+ %define por orps
+ %define pand andps
+ %define vextracti128 vextractf128
+ %define emu_pblendvb emu_blendvps
+ %define emu_pbroadcastd emu_broadcastss
+%endif
+
+
+SECTION_RODATA 64
+
+const_float_abs_mask: times 8 dd 0x7fffffff
+const_align_abs_edge: times 8 dd 0
+
+const_float_0_5: times 8 dd 0.5
+const_float_1: times 8 dd 1.0
+const_float_2: times 8 dd 2.0
+const_float_N2: times 8 dd -2.0
+const_float_sign_mask: times 8 dd 0x80000000
+
+const_int32_offsets:
+ %rep 8
+ dd $-const_int32_offsets
+ %endrep
+SECTION .text
+
+
+%macro PULSES_SEARCH 1
+; m6 Syy_norm
+; m7 Sxy_norm
+ xor r4q, r4q
+ pxor m1, m1 ; max_idx
+
+%if USE_APPROXIMATION == 1
+ movaps m3, mm_const_float_N2
+%else
+ xorps m3, m3 ; p_max
+%endif
+ addps m6, mm_const_float_1 ; Syy_norm += 1.0
+align 16
+%%distortion_search:
+ movd xmm2, dword r4d ; movd zero extends
+%ifidn %1,add
+ movaps m4, [tmpY + r4q] ; y[i]
+ movaps m5, [tmpX + r4q] ; X[i]
+
+%if USE_APPROXIMATION == 2
+ xorps m0, m0
+ cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
+%endif
+
+ addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
+ addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
+
+%if USE_APPROXIMATION == 2
+ andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; prevents the approximation error from setting pulses in the array padding.
+%endif
+
+; addps m4, m6, [tmpY + r4q] ; m4 = Syy_new = Syy_norm + y[i]
+; addps m5, m7, [tmpX + r4q] ; m5 = Sxy_new = Sxy_norm + X[i]
+
+; movaps m4, m6 ; Syy_norm
+; movaps m5, m7 ; Sxy_norm
+; addps m4, [tmpY + r4q] ; m4 = Syy_new = Syy_norm + y[i]
+; addps m5, [tmpX + r4q] ; m5 = Sxy_new = Sxy_norm + X[i]
+%else
+ movaps m5, [tmpY + r4q] ; m5 = y[i]
+
+ xorps m0, m0 ; m0 = 0;
+ cmpps m0, m0, m5, 1 ; m0 = (0<y)
+
+ subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i]
+ subps m5, m7, [tmpX + r4q] ; m5 = Sxy_new = Sxy_norm - X[i]
+ andps m5, m0 ; (0<y)?m5:0
+%endif
+
+%if USE_APPROXIMATION == 2
+ rsqrtps m4, m4
+ mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
+%elif USE_APPROXIMATION == 1
+ mulps m5, m5
+ subps m5, m4 ; This hacked formula improves precision, to compensate for the approximation.
+ rcpps m4, m4
+ mulps m5, m4 ; m5 = p = (Sxy_new*Sxy_new - Syy)*approx(1/Syy)
+%else
+ mulps m5, m5
+ divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy
+%endif
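+ ; Note on the USE_APPROXIMATION == 1 formula above (descriptive only): with
+ ; r = approx(1/Syy), p = (Sxy*Sxy - Syy)*r ~= Sxy*Sxy/Syy - 1. In exact
+ ; arithmetic that is the true ratio minus a constant, so the position of the
+ ; maximum is unchanged, while the compared values are smaller in magnitude,
+ ; which presumably makes the rcpps relative error less likely to flip a
+ ; comparison.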
+ emu_pbroadcastd m2, xmm2 ; m2 = i (all lanes get the same value; the per-lane offsets are added after the loop)
+
+ cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
+ maxps m3, m5 ; m3=max(p_max,p)
+ ; maxps here is faster than blendvps, despite blend having lower latency.
+ ; maxps is also shorter, so the inner loop fits in the icache better
+; emu_blendvps m3, m5, m0 ; m3 = m0 ? p : p_max
+ ; SSE4.1 is using hardcoded xmm0 register as third argument
+%if cpuflag(avx) && notcpuflag(avx2) && mmsize == 32
+ emu_pblendvb m1, m2, m0
+%elif 01
+ pand m2, m0 ; This version seems faster than sse41 pblendvb
+ pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4
+%else
+ pandn m0, m1 ; version for r4q decreasing. It's not faster than real blendv.
+ pmaxsw m0, m2
+ movdqa m1, m0
+%endif
+
+ add r4q, mmsize
+ cmp r4q, Nq
+ jb %%distortion_search
+
+ por m1, mm_const_int32_offsets ; max_idx offset per individual lane (skipped in the inner loop)
+ movdqa m4, m1 ; needed for the aligned y[max_idx]+=2; processing
+
+%if mmsize >= 32
+;merge parallel maximums round 2 (4 vs 4)
+
+ vextractf128 xmm5, ymm3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b]
+ cmpps xmm0, xmm3, xmm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
+
+ vextracti128 xmm2, ymm1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b]
+ emu_blendvps xmm3, xmm5, xmm0 ; p = m0 ? p[1x128] : p[0x128]
+ emu_pblendvb xmm1, xmm2, xmm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
+%endif
+
+;merge parallel maximums round 1 (2 vs 2)
+ ; m3=p[3210]
+ movhlps xmm5, xmm3 ; m5=p[xx32]
+ cmpps xmm0, xmm3, xmm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
+
+ pshufd xmm2, xmm1, q3232
+ emu_blendvps xmm3, xmm5, xmm0 ; p = m0 ? p[3,2] : p[1,0]
+ emu_pblendvb xmm1, xmm2, xmm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
+
+;merge parallel maximums final round (1 vs 1)
+ movaps xmm0, xmm3 ; m0 = m3[0] = p[0]
+ shufps xmm3, xmm3, q1111 ; m3 = m3[1] = p[1]
+ cmpless xmm0, xmm3
+
+ pshufd xmm2, xmm1, q1111
+ emu_pblendvb xmm1, xmm2, xmm0
+
+ movd dword r4d, xmm1 ; zero extends to the rest of r4q
+
+;pextrw r4d,m1,0 ; extract word (16b) from pos0 to r4d
+
+ emu_broadcastss m3, [tmpX + r4q]
+ %{1}ps m7, m3 ; Sxy +-= X[max_idx]
+
+%if STALL_WRITE_FORWARDING == 0
+ %if SHORT_SYY_UPDATE == 1
+ emu_broadcastss m5, [tmpY + r4q]
+ %{1}ps m6, m5 ; Syy +-= Y[max_idx]
+ %endif
+ ; This code is bigger and uglier, but it avoids a store-forwarding
+ ; stall that would be caused by storing a small value and then
+ ; doing a wider load from the same address.
+ ; This helps for N<100, which is the typical case.
+ emu_pbroadcastd m1, xmm1
+ pcmpeqd m1, m1, m4 ; only the lane whose index equals the new max_idx matches
+
+
+ and r4q, ~(mmsize-1) ; align down, so max_idx is (somewhere) inside mmsize load
+ movaps m5, [tmpY + r4q] ; m5 = Y[y3...ym...y0]
+ movaps m3, mm_const_float_2 ; m3 = [2.0, 2.0, 2.0, 2.0]
+ %if SHORT_SYY_UPDATE == 0
+ movaps m2, m5 ; m2 = m5
+ %endif
+ andps m3, m1 ; m3 = [ 0...2.0...0]
+ %{1}ps m5, m3 ; m5 = Y[y3...ym...y0] +/- [0...2.0...0]
+ movaps [tmpY + r4q], m5 ; Y[max_idx] +-= 2.0;
+
+ %if SHORT_SYY_UPDATE == 0
+ andps m2, m1 ; m2 = Y[0...ym...0]
+ HSUMPS m2, m3 ; m2 = Y[ym, ym, ym, ym];
+ %{1}ps m6, m2 ; Syy +-= Y[max_idx]
+ %endif
+%else
+ ; This code is shorter and simpler, but slower, because it breaks
+ ; store forwarding: it stores a small value at an unaligned address
+ ; where we later do a bigger aligned load.
+ ; With bigger vectors (N>100) the store might complete before we try to load it.
+ emu_broadcastss m4, [tmpY + r4q]
+ %{1}ps m6, m4 ; Syy +-= Y[max_idx]
+ %{1}ps m4, mm_const_float_2
+ movss [tmpY + r4q], xmm4 ; Y[max_idx] +-= 2.0;
+%endif
+
+%endmacro
+
+;
+; Pyramid Vector Quantization Search implementation
+;
+; float * inX - Unaligned (SIMD) access; it will be overread,
+; but the extra data is masked away.
+; int32 * outY - Must be an aligned and padded buffer.
+; It is also used as a temporary buffer.
+; uint32 K - Number of pulses to have after quantization.
+; uint32 N - Number of vector elements. Must be 0 < N < 256.
+;
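+; A rough scalar sketch of the algorithm implemented below, written as C for
+; readability. Illustration only: it is simplified relative to both this asm
+; (which keeps y[] scaled by 2.0 as floats and replaces the division with a
+; reciprocal approximation) and the C code in opus_pvq.c.
+;
+;   float pvq_search_sketch(const float *X, int *y, int K, int N)
+;   {
+;       float Sx = 0, Sxy = 0, Syy = 0;
+;       int   i, pulses = 0;
+;       for (i = 0; i < N; i++)
+;           Sx += fabsf(X[i]);
+;       if (Sx == 0) {                      // all-zero input
+;           for (i = 0; i < N; i++)
+;               y[i] = 0;
+;           return 1.0f;
+;       }
+;       for (i = 0; i < N; i++) {           // pre-search guess
+;           int yt  = (int)lrintf(K / Sx * fabsf(X[i]));
+;           y[i]    = yt;
+;           pulses += yt;
+;           Sxy    += fabsf(X[i]) * yt;
+;           Syy    += (float)yt * yt;
+;       }
+;       while (pulses < K) {                // add the missing pulses one by one
+;           int   best  = 0;
+;           float p_max = -1;
+;           for (i = 0; i < N; i++) {
+;               float Sxy_new = Sxy + fabsf(X[i]);
+;               float Syy_new = Syy + 2 * y[i] + 1;
+;               float p       = Sxy_new * Sxy_new / Syy_new;
+;               if (p > p_max) { p_max = p; best = i; }
+;           }
+;           Sxy += fabsf(X[best]);
+;           Syy += 2 * y[best] + 1;
+;           y[best]++;
+;           pulses++;
+;       }
+;       // pulses > K is handled analogously: remove the pulse whose removal
+;       // keeps Sxy*Sxy/Syy largest (only at positions where y[i] > 0).
+;       for (i = 0; i < N; i++)             // restore the signs of X[]
+;           y[i] = X[i] < 0 ? -y[i] : y[i];
+;       return Syy;
+;   }
+;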
+%macro PVQ_FAST_SEARCH 0
+cglobal pvq_search,4,6,8, 256*4, inX, outY, K, N
+%define tmpX rsp
+%define tmpY outYq
+
+
+ movaps m0, [const_float_abs_mask]
+ shl Nq, 2 ; N *= sizeof(float)
+ mov r4q, Nq
+
+ neg r4q
+ and r4q, mmsize-1
+
+%ifdef PIC
+ lea r5q, [const_align_abs_edge] ; rip+const
+ movups m3, [r5q + r4q - mmsize]
+%else
+ movups m3, [const_align_abs_edge - mmsize + r4q] ; this is the bit mask for the padded read at the end of the input
+%endif
+
+ add Nq, r4q ; Nq = align(Nq, mmsize)
+; sub rsp, Nq ; allocate tmpX[Nq]
+
+ lea r4q, [Nq - mmsize] ; Nq is rounded up (aligned up) to mmsize, so r4q can't become negative here, unless N=0.
+ movups m2, [inXq + r4q]
+ andps m2, m3
+ movaps [tmpX + r4q], m2
+ movaps m1, m2 ; Sx
+
+align 16
+%%loop_abs_sum:
+ sub r4q, mmsize
+ jc %%end_loop_abs_sum
+
+ movups m2, [inXq + r4q]
+ andps m2, m0
+
+ movaps [tmpX + r4q], m2 ; tmpX[i]=abs(X[i])
+ addps m1, m2
+ jmp %%loop_abs_sum
+
+
+align 16
+%%end_loop_abs_sum:
+
+ HSUMPS m1, m2 ; m1 = Sx
+
+ xorps m0, m0
+ comiss xmm0, xmm1 ; Sx == 0.0 ?
+ jz %%zero_input ; if (Sx==0) goto zero_input
+
+ cvtsi2ss xmm0, dword Kd ; m0 = K
+; rcpss xmm1, xmm1 ; m1 = approx(1/Sx)
+; mulss xmm0, xmm1 ; m0 = K*(1/Sx)
+ divss xmm0, xmm1 ; m0 = b = K/Sx
+
+
+ emu_broadcastss m0, xmm0
+
+ lea r4q, [Nq - mmsize]
+ pxor m5, m5 ; Sy ( Sum of y[i] )
+ xorps m6, m6 ; Syy ( Sum of y[i]*y[i] )
+ xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] )
+align 16
+%%loop_guess:
+ movaps m1, [tmpX + r4q] ; m1 = X[i]
+ mulps m2, m0, m1 ; m2 = res*X[i]
+%if ALL_FLOAT_PRESEARCH == 1
+ %if PRESEARCH_ROUNDING == 0
+ roundps m2, m2, 0 ; yt = (float)truncf( res*X[i] )
+ %else
+ roundps m2, m2, 3 ; yt = (float)lrintf( res*X[i] )
+ %endif
+ addps m5, m2 ; Sy += yt
+%else
+ %if PRESEARCH_ROUNDING == 0
+ cvttps2dq m2, m2 ; yt = (int)truncf( res*X[i] )
+ %else
+ cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] )
+ %endif
+ paddd m5, m2 ; Sy += yt
+ cvtdq2ps m2, m2 ; yt = (float)yt
+%endif
+ mulps m1, m2 ; m1 = X[i]*yt
+ mulps m3, m2, m2 ; m3 = yt*yt
+ addps m7, m1 ; Sxy += m1;
+ addps m6, m3 ; Syy += m3
+ addps m2, m2 ; m2 = 2*yt
+ movaps [tmpY + r4q], m2 ; y[i] = m2
+
+ sub r4q, mmsize
+ jnc %%loop_guess
+; add r4q, mmsize
+; cmp r4q, Nq
+; jb %%loop_guess
+
+
+%if ALL_FLOAT_PRESEARCH == 1
+ HSUMPS m5, m4 ; pulses
+%else
+ HSUMDQ m5, m4 ; pulses
+%endif
+ HSUMPS m6, m1 ; Syy_norm
+ HSUMPS m7, m0 ; Sxy_norm
+
+%if num_mmregs > 8 && CONST_IN_X64_REG_IS_FASTER ; it works faster for me with mem access. ^_^
+ %define mm_const_float_1 m11
+ %define mm_const_float_2 m12
+ %define mm_const_int32_offsets m13
+ %define mm_const_float_N2 m14
+ movaps m11, [const_float_1]
+ movaps m12, [const_float_2]
+ movdqa m13, [const_int32_offsets]
+ movaps m14, [const_float_N2]
+%else
+ %define mm_const_float_1 [const_float_1]
+ %define mm_const_float_2 [const_float_2]
+ %define mm_const_float_N2 [const_float_N2]
+ %define mm_const_int32_offsets [const_int32_offsets]
+%endif
+
+%if ALL_FLOAT_PRESEARCH == 1
+ cvtss2si r4q, xmm5
+%else
+ movd dword r4d, xmm5 ; zero extends to the rest of r4q
+%endif
+
+ sub Kq, r4q ; K -= pulses
+ jc %%remove_pulses_loop
+ jz %%finish
+
+
+align 16
+%%add_pulses_loop:
+ PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
+
+ sub Kq, 1
+ jnz %%add_pulses_loop
+
+ jmp %%finish
+
+align 16
+%%remove_pulses_loop:
+ PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
+
+ add Kq, 1
+ jnz %%remove_pulses_loop
+
+
+%%finish:
+align 16
+ lea r4q, [Nq - mmsize]
+
+ movaps m2, [const_float_sign_mask]
+ movaps m4, [const_float_0_5]
+align 16
+%%restore_sign_loop:
+ movaps m0, [tmpY + r4q] ; m0 = Y[i]
+ movups m1, [inXq + r4q] ; m1 = X[i]
+ andps m1, m2 ; m1 = sign(X[i])
+ mulps m0, m4 ; m0 = Y[i]*0.5
+ orps m0, m1 ; m0 = Y[i]*0.5*sign
+ cvtps2dq m3, m0 ; m3 = (int)m0
+ movaps [outYq + r4q], m3
+
+ sub r4q, mmsize
+ jnc %%restore_sign_loop
+%%return:
+;release the temp buffer and restore the stack
+; add rsp, Nq ; free tmpX, Nq is the same as the one used to allocate it (after Nq has been aligned up)
+
+%if ARCH_X86_64 == 0 ; on x86_32 return the float in st0 (same approach as sbrdsp)
+ movss r0m, xmm6 ; return (float)Syy_norm
+ fld dword r0m
+%else
+ movaps m0, m6 ; return (float)Syy_norm
+%endif
+ RET
+
+align 16
+%%zero_input:
+ lea r4q, [Nq - mmsize]
+ xorps m0, m0
+%%zero_loop:
+ movaps [outYq + r4q], m0
+ sub r4q, mmsize
+ jnc %%zero_loop
+
+ movaps m6, [const_float_1]
+ jmp %%return
+%endmacro
+
+
+INIT_XMM sse2
+PVQ_FAST_SEARCH
+
+INIT_XMM sse42
+PVQ_FAST_SEARCH
+
+INIT_XMM avx
+PVQ_FAST_SEARCH
+
+INIT_YMM avx2
+PVQ_FAST_SEARCH
--
2.13.0