[FFmpeg-devel,WIP] v2 Opus Pyramid Vector Quantization Search in x86 SIMD asm

Message ID	CABA=pqeaQ77Xha79aAdvuP7d4pBLf4B4uLoL3uzpipF-FbBuNw@mail.gmail.com
State	Superseded
Headers	show Delivered-To: ffmpegpatchwork@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; MIME-Version: 1.0 From: Ivan Kalvachev <ikalvachev@gmail.com> Date: Sat, 24 Jun 2017 23:39:03 +0300 Message-ID: <CABA=pqeaQ77Xha79aAdvuP7d4pBLf4B4uLoL3uzpipF-FbBuNw@mail.gmail.com> To: ffmpeg-devel@ffmpeg.org Content-Type: multipart/mixed; boundary="089e082345fc61665c0552bab5a3" Subject: [FFmpeg-devel] [WIP][PATCH]v2 Opus Pyramid Vector Quantization Search in x86 SIMD asm Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

From 7d875390bb81973b6e4d1fb8ad54594ea3f8d5da Mon Sep 17 00:00:00 2001 From: Ivan Kalvachev <ikalvachev@gmail.com> Date: Thu, 8 Jun 2017 22:24:33 +0300 Subject: [PATCH 1/3] SIMD opus pvq_search implementation v2 --- libavcodec/opus_pvq.c | 9 + libavcodec/opus_pvq.h | 5 +- libavcodec/x86/Makefile | 1 + libavcodec/x86/opus_dsp_init.c | 47 +++ libavcodec/x86/opus_pvq_search.asm | 628 +++++++++++++++++++++++++++++++++++++ 5 files changed, 688 insertions(+), 2 deletions(-) create mode 100644 libavcodec/x86/opus_dsp_init.c create mode 100644 libavcodec/x86/opus_pvq_search.asm diff --git a/libavcodec/opus_pvq.c b/libavcodec/opus_pvq.c index 2ac66a0ede..172223e241 100644 --- a/libavcodec/opus_pvq.c +++ b/libavcodec/opus_pvq.c @@ -418,7 +418,13 @@ static uint32_t celt_alg_quant(OpusRangeCoder *rc, float *X, uint32_t N, uint32_ int *y = pvq->qcoeff; celt_exp_rotation(X, N, blocks, K, spread, 1); + + { + START_TIMER gain /= sqrtf(pvq->pvq_search(X, y, K, N)); + STOP_TIMER("pvq_search"); + } + celt_encode_pulses(rc, y, N, K); celt_normalize_residual(y, X, N, gain); celt_exp_rotation(X, N, blocks, K, spread, 0); @@ -947,6 +953,9 @@ int av_cold ff_celt_pvq_init(CeltPVQ **pvq) s->encode_band = pvq_encode_band; s->band_cost = pvq_band_cost; + if (ARCH_X86) + ff_opus_dsp_init_x86(s); + *pvq = s; return 0; diff --git a/libavcodec/opus_pvq.h b/libavcodec/opus_pvq.h index 6691494838..9246337360 100644 --- a/libavcodec/opus_pvq.h +++ b/libavcodec/opus_pvq.h @@ -33,8 +33,8 @@ float *lowband_scratch, int fill) struct CeltPVQ { - DECLARE_ALIGNED(32, int, qcoeff )[176]; - DECLARE_ALIGNED(32, float, hadamard_tmp)[176]; + DECLARE_ALIGNED(32, int, qcoeff )[256]; + DECLARE_ALIGNED(32, float, hadamard_tmp)[256]; float (*pvq_search)(float *X, int *y, int K, int N); @@ -45,6 +45,7 @@ struct CeltPVQ { }; int ff_celt_pvq_init (struct CeltPVQ **pvq); +void ff_opus_dsp_init_x86(struct CeltPVQ *s); void ff_celt_pvq_uninit(struct CeltPVQ **pvq); #endif /* AVCODEC_OPUS_PVQ_H */ diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index b86700b675..bdf250624d 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -52,6 +52,7 @@ OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o +OBJS-$(CONFIG_OPUS_ENCODER) += x86/opus_dsp_init.o x86/opus_pvq_search.o OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c new file mode 100644 index 0000000000..ddccf6fe9e --- /dev/null +++ b/libavcodec/x86/opus_dsp_init.c @@ -0,0 +1,47 @@ +/* + * Opus encoder assembly optimizations + * Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/x86/cpu.h" +#include "libavcodec/opus_pvq.h" + +extern float ff_pvq_search_sse2 (float *X, int *y, int K, int N); +extern float ff_pvq_search_sse42(float *X, int *y, int K, int N); +extern float ff_pvq_search_avx (float *X, int *y, int K, int N); +extern float ff_pvq_search_avx2 (float *X, int *y, int K, int N); + +av_cold void ff_opus_dsp_init_x86(CeltPVQ *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) + s->pvq_search = ff_pvq_search_sse2; + + if (EXTERNAL_SSE42(cpu_flags)) + s->pvq_search = ff_pvq_search_sse42; + + if (EXTERNAL_AVX(cpu_flags)) + s->pvq_search = ff_pvq_search_avx; + + if (EXTERNAL_AVX2(cpu_flags)) + s->pvq_search = ff_pvq_search_avx2; +} diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm new file mode 100644 index 0000000000..36b679b75e --- /dev/null +++ b/libavcodec/x86/opus_pvq_search.asm @@ -0,0 +1,628 @@ +; Opus encoder assembly optimizations +; Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com> + +%include "config.asm" +%include "libavutil/x86/x86util.asm" + + +%define HADDPS_IS_FAST 0 +%define PHADDD_IS_FAST 0 +%define BLENDVPS_IS_FAST 01 +%define PBLENDVB_IS_FAST 01 +%define CONST_IN_X64_REG_IS_FASTER 01 + +%define STALL_WRITE_FORWARDING 0 +%define SHORT_SYY_UPDATE 01 + +%define USE_APPROXIMATION 01 +%define PRESEARCH_ROUNDING 01 +%define ALL_FLOAT_PRESEARCH cpuflag(sse42) && 0 + + +; +; Horizontal Sum Packed Single precision float +; The resulting sum is in all elements. +; +%macro HSUMPS 2 ; dst, tmp +%if cpuflag(sse42) && HADDPS_IS_FAST + %if sizeof%1>=32 ; avx mmsize=32 + vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1) + addps %1, %2 + %endif ; sse&avx mmsize=16 + haddps %1, %1 ; 5-6c 2/1 + haddps %1, %1 +%elif cpuflag(avx) + %if sizeof%1>=32 ; avx + vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1) + addps %1, %2 + %endif + vshufps %2, %1, %1, q1032 ; %2=[b, a, d, c ] + addps %1, %2 ; %1=[b+d, a+c, d+b, c+a ] + vshufps %2, %1, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ] + addps %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ] + +%elif 01 ; this form is a bit faster than the short avx-like emulation. + movaps %2, %1 ; [d, c, b, a ] + shufps %1, %1, q1032 ; %2=[b, a, d, c ] + addps %1, %2 ; %1=[b+d, a+c, d+b, c+a ] + movaps %2, %1 + shufps %1, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ] + addps %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ] + ; all %1 members should be equal for as long as float a+b==b+a +%else + %error pick HSUMPS implementation +%endif +%endmacro + + +; +; Horizontal Sum integer Double word +; The resulting sum is in all elements. +; +%macro HSUMDQ 2 ; dst, tmp +%if cpuflag(avx) && notcpuflag(avx2) && sizeof%1>=32 + %error emulating integer 256bit ymm operation with float point op. there will be speed penalty. + vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%2)<<128+hi(%2) + paddd xmm%1,xmm%2 + phaddd xmm%1,xmm%1 + phaddd xmm%1,xmm%1 + vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm, im ; 3c 1/1 +%elif cpuflag(ssse3) && PHADDD_IS_FAST + %if sizeof%1>=32 + vperm2i128 %2, %1, %1, (0)*16+(1) ; %2=lo(%2)<<128+hi(%2) + paddd %1, %2 + %endif ;mmsize == 16 + phaddd %1, %1 ;3 cycles on intel + phaddd %1, %1 + ;6 cycles total (sse) +%elif 01 + %if cpuflag(avx2) && sizeof%1>=32 ;mmsize>=32 + vperm2i128 %2, %1, %1, (0)*16+(1) ; %2=lo(%2)<<128+hi(%2) + paddd %1, %2 + %endif + pshufd %2, %1, q1032 ; %2=[b, a, d, c ] + paddd %1, %2 ; %1=[b+d, a+c, d+b, c+a ] + pshufd %2, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ] + paddd %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ] + ;4 cycles total (sse) +%else + %error pick HSUMDQ implementation +%endif +%endmacro + + +; +; Emulate blendvps if not available +; +; src_b destroyed when using emulation with logical operands +; SSE41 blend instruction is hard fixed to use xmm0 as mask +; +%macro emu_blendvps 3 ; dst_a, src_b, mask +%if cpuflag(avx) && BLENDVPS_IS_FAST + blendvps %1, %1, %2, %3 +%elif cpuflag(sse42) && BLENDVPS_IS_FAST + %if notcpuflag(avx) + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3d operand, you used %3 + %endif + %endif + blendvps %1, %2, %3 +%elif 01 + xorps %2, %1 + andps %2, %3 + xorps %1, %2 +%else + %error pick "blendvps" emulation +%endif +%endmacro + + +; +; Emulate pblendvb if not available +; +; src_b destroyed when using emulation with logical operands +; SSE41 blend instruction is hard fixed to use xmm0 as mask +; +%macro emu_pblendvb 3 ; dst_a, src_b, mask +%if cpuflag(avx2) && PBLENDVB_IS_FAST + pblendvb %1, %1, %2, %3 +;------------------------- +%elif cpuflag(avx) && notcpuflag(avx2) && PBLENDVB_IS_FAST + %if sizeof%1>=32 +;AVX1 does not implement pblendvb that works with ymm +;AVX1 does not implemnt pxor that works with ymm either, +; so no emulation possible +; Use float point blend and hope the penalty is not severe + %error emulating integer 256bit ymm operation with float point op. there will be speed penalty. + vblendvps %1, %1, %2, %3 + %else + vpblendvb %1, %1, %2, %3 + %endif +;------------------------- +%elif cpuflag(sse42) && PBLENDVB_IS_FAST + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3 operand, you used %3 + %endif + pblendvb %1, %2, %3 +;------------------------- +%elif 01 +;On my cpu, this version is actually faster than pblendvb ! +;pblendvb is much slower than blenvps, go figure! + pxor %2, %1 + pand %2, %3 + pxor %1, %2 +%else + %error pick "pblendvb" emulation +%endif +%endmacro + + +; +; Emulate broadcastss if not available +; +%macro emu_broadcastss 2 ; dst_a, src_b +%if cpuflag(avx2) && 01 + vbroadcastss %1, %2 ; ymm, xmm + +%elif cpuflag(avx) && 01 + %ifnum sizeof%2 ; avx1 register + vpermilps xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm + %if sizeof%1 >= 32 ;mmsize>=32 + vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm, im ; 3c 1/1 +; vperm2f128 %1, %1, %1, 0 ; ymm, ymm, ymm ; 2-3c 1/1 + %endif + %else ; avx1 memory + vbroadcastss %1, %2 ; ymm, mm32 || xmm, mm32 + %endif +%elif cpuflag(sse) && 1 + %ifnum sizeof%2 ; sse register + shufps %1, %2, %2, q0000 + %else ; sse memory +; xorps %1, %1 ; this seems to provide minor speedup + movss %1, %2 ; this zeroes the other 3 elements + shufps %1, %1, 0 + %endif +%else + %error invalid op, SSE needed for emulation +%endif +%endmacro + +; +; Emulate pbroadcastd if not available +; +%macro emu_pbroadcastd 2 ; dst_a, src_b +%if cpuflag(avx2) && 01 + vpbroadcastd %1, %2 +%elif cpuflag(avx) && mmsize >= 32 + ;there are no integer operations with 256bit ymm registers on AVX1 + pshufd xmm%1, xmm%2, q0000 ; xmm, xmm, im + vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm. im + %error emulating integer 256bit ymm operation with float point op. there will be speed penalty. +%elif cpuflag(sse) + %ifnum sizeof%2 ; sse register + pshufd %1, %2, q0000 + %else ; sse memory +; pxor %1, %1 ; this seems to provide minor speedup. On intel it is direct op, so no micro-ops, letency or throughput. + movd %1, %2 ; movd zero extends + pshufd %1, %1, 0 + %endif +%else + %error invalid op, SSE2 needed +%endif +%endmacro + + +;big ugly hack +%if cpuflag(avx) && notcpuflag(avx2) && mmsize == 32 +; AVX1 does not implement integer operations with 256bit ymm registers +; Some operations however could be emulated with float point operations + %define ALL_FLOAT_PRESEARCH 1 + %define pxor xorps + %define por orps + %define pand andps + %define vextracti128 vextractf128 + %define emu_pblendvb emu_blendvps + %define emu_pbroadcastd emu_broadcastss +%endif + + +SECTION_RODATA 64 + +const_float_abs_mask: times 8 dd 0x7fffffff +const_align_abs_edge: times 8 dd 0 + +const_float_0_5: times 8 dd 0.5 +const_float_1: times 8 dd 1.0 +const_float_2: times 8 dd 2.0 +const_float_N2 times 8 dd -2.0 +const_float_sign_mask: times 8 dd 0x80000000 + +const_int32_offsets: + %rep 8 + dd $-const_int32_offsets + %endrep +SECTION .text + + + + + +%macro PULSES_SEARCH 1 +; m6 Syy_norm +; m7 Sxy_norm + xor r4q, r4q + pxor m1, m1 ; max_idx + +%if USE_APPROXIMATION == 1 + movaps m3, mm_const_float_N2 +%else + xorps m3, m3 ; p_max +%endif + addps m6, mm_const_float_1 ; Syy_norm += 1.0 +align 16 +%%distortion_search: + movd xmm2, dword r4d ; movd zero extends +%ifidn %1,add + movaps m4, [tmpY + r4q] ; y[i] + movaps m5, [tmpX + r4q] ; X[i] + +%if USE_APPROXIMATION == 2 + xorps m0, m0 + cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) +%endif + + addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm + addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm + +%if USE_APPROXIMATION == 2 + andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding. +%endif + +; addps m4, m6, [tmpY + r4q] ; m4 = Syy_new = Syy_norm + y[i] +; addps m5, m7, [tmpX + r4q] ; m5 = Sxy_new = Sxy_norm + X[i] + +; movaps m4, m6 ; Syy_norm +; movaps m5, m7 ; Sxy_norm +; addps m4, [tmpY + r4q] ; m4 = Syy_new = Syy_norm + y[i] +; addps m5, [tmpX + r4q] ; m5 = Sxy_new = Sxy_norm + X[i] +%else + movaps m5, [tmpY + r4q] ; m5 = y[i] + + xorps m0, m0 ; m0 = 0; + cmpps m0, m0, m5, 1 ; m0 = (0<y) + + subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i] + subps m5, m7, [tmpX + r4q] ; m5 = Sxy_new = Sxy_norm - X[i] + andps m5, m0 ; (0<y)?m5:0 +%endif + +%if USE_APPROXIMATION == 2 + rsqrtps m4, m4 + mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) ) +%elif USE_APPROXIMATION == 1 + mulps m5, m5 + subps m5, m4 ; This hacked formula improves precision, to compensate for the approximation. + rcpps m4, m4 + mulps m5, m4 ; m5 = p = (Sxy_new*Sxy_new - Syy)*approx(1/Syy) +%else + mulps m5, m5 + divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy +%endif + emu_pbroadcastd m2, xmm2 ; m2=i (all lanes get same values, we add the offset-per-lane, later) + + cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max) + maxps m3, m5 ; m3=max(p_max,p) + ; maxps here is faster than blendvps, despite blend having lower latency. + ; maxps is also shorter so the inner loop fits in icache segment +; emu_blendvps m3, m5, m0 ; m3 = m0 ? p : p_max + ; SSE4.1 is using hardcoded xmm0 register as third argument +%if cpuflag(avx) && notcpuflag(avx2) && mmsize == 32 + emu_pblendvb m1, m2, m0 +%elif 01 + pand m2, m0 ; This version seems faster than sse41 pblendvb + pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4 +%else + pandn m0, m1 ; version for r4q decreasing. It's not faster than real blendv. + pmaxsw m0, m2 + movdqa m1, m0 +%endif + + add r4q, mmsize + cmp r4q, Nq + jb %%distortion_search + + por m1, mm_const_int32_offsets ; max_idx offset per individual lane (skipped in the inner loop) + movdqa m4, m1 ; needed for the aligned y[max_idx]+=2; processing + +%if mmsize >= 32 +;merge parallel maximums round 2 (4 vs 4) + + vextractf128 xmm5, ymm3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b] + cmpps xmm0, xmm3, xmm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] ) + + vextracti128 xmm2, ymm1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b] + emu_blendvps xmm3, xmm5, xmm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128] + emu_pblendvb xmm1, xmm2, xmm0 ; p = m0 ? p[1x128] : p[0x128] +%endif + +;merge parallel maximums round 1 (2 vs 2) + ; m3=p[3210] + movhlps xmm5, xmm3 ; m5=p[xx32] + cmpps xmm0, xmm3, xmm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] ) + + pshufd xmm2, xmm1, q3232 + emu_blendvps xmm3, xmm5, xmm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0] + emu_pblendvb xmm1, xmm2, xmm0 ; p = m0 ? p[3,2] : p[1,0] + +;merge parallel maximums final round (1 vs 1) + movaps xmm0, xmm3 ; m0 = m3[0] = p[0] + shufps xmm3, xmm3, q1111 ; m3 = m3[1] = p[1] + cmpless xmm0, xmm3 + + pshufd xmm2, xmm1, q1111 + emu_pblendvb xmm1, xmm2, xmm0 + + movd dword r4d, xmm1 ; zero extends to the rest of r4q + +;pextrw r4d,m1,0 ; extract word (16b) from pos0 to r4d + + emu_broadcastss m3, [tmpX + r4q] + %{1}ps m7, m3 ; Sxy += X[max_idx] + +%if STALL_WRITE_FORWARDING == 0 + %if SHORT_SYY_UPDATE == 1 + emu_broadcastss m5, [tmpY + r4q] + %{1}ps m6, m5 ; Syy += Y[max_idx] + %endif + ; this code is big and ugly + ; but it prevents stalling a store forwarding + ; by writing smaller size and then doing + ; big load from same address. + ; This helps with N<100 that are the typical case. + emu_pbroadcastd m1, xmm1 + pcmpeqd m1, m1, m4 ; only 1 element matches the address of the new p_max + + + and r4q, ~(mmsize-1) ; align down, so max_idx is (somewhere) inside mmsize load + movaps m5, [tmpY + r4q] ; m5 = Y[y3...ym...y0] + movaps m3, mm_const_float_2 ; m3 = [2.0, 2.0, 2.0, 2.0] + %if SHORT_SYY_UPDATE == 0 + movaps m2, m5 ; m2 = m5 + %endif + andps m3, m1 ; m3 = [ 0...2.0...0] + %{1}ps m5, m3 ; m5 = Y[y3...ym...y0] +/- [0...2.0...0] + movaps [tmpY + r4q], m5 ; Y[max_idx] +-= 2.0; + + %if SHORT_SYY_UPDATE == 0 + andps m2, m1 ; m2 = Y[0...ym...0] + HSUMPS m2, m3 ; m2 = Y[ym, ym, ym, ym]; + %{1}ps m6, m2 ; Syy += Y[max_idx] + %endif +%else + ; this code is shorter, simpler and slower + ; because it breaks store forwarding + ; it writes smaller size at unaligned address where we later do a bigger aligned load. + ; When we have biger vectors (N>100) the store might complete before we try to load it + emu_broadcastss m4, [tmpY + r4q] + %{1}ps m6, m4 ; Syy += outY[max_idx] + %{1}ps m4, mm_const_float_2 + movss [tmpY + r4q], xmm4 ; Y[max_idx] += 2.0; +%endif + +%endmacro + +; +; Pyramid Vector Quantization Search implementation +; +; float * inX - Unaligned (SIMD) access, it will be overread, +; but extra data will be masked away. +; int32 * outY - Should be aligned and padded buffer. +; It would be used as temp buffer. +; uint32 K - Number of pulses to have after quantizations. +; uint32 N - Number of vector elements. Must be 0 < N < 256 +; +%macro PVQ_FAST_SEARCH 0 +cglobal pvq_search,4,6,8, 256*4, inX, outY, K, N +%define tmpX rsp +%define tmpY outYq + + + movaps m0, [const_float_abs_mask] + shl Nq, 2 ; N *= sizeof(float) + mov r4q, Nq + + neg r4q + and r4q, mmsize-1 + +%ifdef PIC + lea r5q, [const_align_abs_edge] ; rip+const + movups m3, [r5q + r4q - mmsize] +%else + movups m3, [const_align_abs_edge - mmsize + r4q] ; this is the bit mask for the padded read at the end of the input +%endif + + add Nq, r4q ; Nq = align(Nq, mmsize) +; sub rsp, Nq ; allocate tmpX[Nq] + + lea r4q, [Nq - mmsize] ; Nq is rounded up (aligned up) to mmsize, so r4q can't become negative here, unless N=0. + movups m2, [inXq + r4q] + andps m2, m3 + movaps [tmpX + r4q], m2 + movaps m1, m2 ; Sx + +align 16 +%%loop_abs_sum: + sub r4q, mmsize + jc %%end_loop_abs_sum + + movups m2, [inXq + r4q] + andps m2, m0 + + movaps [tmpX + r4q], m2 ; tmpX[i]=abs(X[i]) + addps m1, m2 + jmp %%loop_abs_sum + + +align 16 +%%end_loop_abs_sum + + HSUMPS m1, m2 ; m1 = Sx + + xorps m0, m0 + comiss xmm0, xmm1 ; + jz %%zero_input ; if (Sx==0) goto zero_input + + cvtsi2ss xmm0, dword Kd ; m0 = K +; rcpss xmm1, xmm1 ; m1 = approx(1/Sx) +; mulss xmm0, xmm1 ; m0 = K*(1/Sx) + divss xmm0, xmm1 ; b = K/Sx + ; b = K/max_x + + + emu_broadcastss m0, xmm0 + + lea r4q, [Nq - mmsize] + pxor m5, m5 ; Sy ( Sum of y[i] ) + xorps m6, m6 ; Syy ( Sum of y[i]*y[i] ) + xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] ) +align 16 +%%loop_guess + movaps m1, [tmpX + r4q] ; m1 = X[i] + mulps m2, m0, m1 ; m2 = res*X[i] +%if ALL_FLOAT_PRESEARCH == 1 + %if PRESEARCH_ROUNDING == 0 + roundps m2, m2, 0 ; yt = (float)truncf( res*X[i] ) + %else + roundps m2, m2, 3 ; yt = (float)lrintf( res*X[i] ) + %endif + addps m5, m2 ; Sy += yt +%else + %if PRESEARCH_ROUNDING == 0 + cvttps2dq m2, m2 ; yt = (int)truncf( res*X[i] ) + %else + cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] ) + %endif + paddd m5, m2 ; Sy += yt + cvtdq2ps m2, m2 ; yt = (float)yt +%endif + mulps m1, m2 ; m1 = X[i]*yt + mulps m3, m2, m2 ; m3 = yt*yt + addps m7, m1 ; Sxy += m1; + addps m6, m3 ; Syy += m3 + addps m2, m2 ; m2 = 2*yt + movaps [tmpY + r4q], m2 ; y[i] = m2 + + sub r4q, mmsize + jnc %%loop_guess +; add r4q, mmsize +; cmp r4q, Nq +; jb %%loop_guess + + +%if ALL_FLOAT_PRESEARCH == 1 + HSUMPS m5, m4 ; pulses +%else + HSUMDQ m5, m4 ; pulses +%endif + HSUMPS m6, m1 ; Syy_norm + HSUMPS m7, m0 ; Sxy_norm + +%if num_mmregs > 8 && CONST_IN_X64_REG_IS_FASTER ; it works faster for me with mem access. ^_^ + %define mm_const_float_1 m11 + %define mm_const_float_2 m12 + %define mm_const_int32_offsets m13 + %define mm_const_float_N2 m14 + movaps m11, [const_float_1] + movaps m12, [const_float_2] + movdqa m13, [const_int32_offsets] + movaps m14, [const_float_N2] +%else + %define mm_const_float_1 [const_float_1] + %define mm_const_float_2 [const_float_2] + %define mm_const_float_N2 [const_float_N2] + %define mm_const_int32_offsets [const_int32_offsets] +%endif + +%if ALL_FLOAT_PRESEARCH == 1 + cvtss2si r4q, xmm5 +%else + movd dword r4d, xmm5 ; zero extends to the rest of r4q +%endif + + sub Kq, r4q ; K -= pulses + jc %%remove_pulses_loop + jz %%finish + + +align 16 +%%add_pulses_loop: + PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm + + sub Kq, 1 + jnz %%add_pulses_loop + + jmp %%finish + +align 16 +%%remove_pulses_loop: + PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm + + add Kq, 1 + jnz %%remove_pulses_loop + + +%%finish: +align 16 + lea r4q, [Nq - mmsize] + + movaps m2, [const_float_sign_mask] + movaps m4, [const_float_0_5] +align 16 +%%restore_sign_loop: + movaps m0, [tmpY + r4q] ; m0 = Y[i] + movups m1, [inXq + r4q] ; m1 = X[i] + andps m1, m2 ; m1 = sign(X[i]) + mulps m0, m4 ; m0 = Y[i]*0.5 + orps m0, m1 ; m0 = Y[i]*0.5*sign + cvtps2dq m3, m0 ; m3 = (int)m0 + movaps [outYq + r4q], m3 + + sub r4q, mmsize + jnc %%restore_sign_loop +%%return +;release the temp buffer and restore the stack +; add rsp, Nq ; free tmpX, Nq is the same as the one used to allocate it (after Nq has been aligned up) + +%if ARCH_X86_64 == 0 ;sbrdsp + movss r0m, xmm6 ; return (float)Syy_norm + fld dword r0m +%else + movaps m0, m6 ; return (float)Syy_norm +%endif + RET + +align 16 +%%zero_input: + lea r4q, [Nq - mmsize] + xorps m0, m0 +%%zero_loop: + movaps [outYq + r4q], m0 + sub r4q, mmsize + jnc %%zero_loop + + movaps m6, [const_float_1] + jmp %%return +%endmacro + + +INIT_XMM sse2 +PVQ_FAST_SEARCH + +INIT_XMM sse42 +PVQ_FAST_SEARCH + +INIT_XMM avx +PVQ_FAST_SEARCH + +INIT_YMM avx2 +PVQ_FAST_SEARCH -- 2.13.0

[FFmpeg-devel,WIP] v2 Opus Pyramid Vector Quantization Search in x86 SIMD asm

Commit Message

Comments

Patch