Message ID | CABA=pqcmfV7+xZVNzcvgvHaFbXoOpkuhbTq4jkwzxkGg+i6pAw@mail.gmail.com |
---|---|
State | Superseded |
On 26 July 2017 at 15:56, Ivan Kalvachev <ikalvachev@gmail.com> wrote: > + if (ARCH_X86 && CONFIG_OPUS_ENCODER) > + ff_opus_dsp_init_x86(s); > Just change it to + if (ARCH_X86) The init function is named opus_dsp, so it'll get used to other opus things, not just the encoder. The assembly code looks fine to me, but other people will have to take a look at it in case I'm missing something.
On 7/27/17, Rostislav Pehlivanov <atomnuker@gmail.com> wrote: > On 26 July 2017 at 15:56, Ivan Kalvachev <ikalvachev@gmail.com> wrote: > >> + if (ARCH_X86 && CONFIG_OPUS_ENCODER) >> + ff_opus_dsp_init_x86(s); >> > > Just change it to > + if (ARCH_X86) > > The init function is named opus_dsp, so it'll get used to other opus > things, not just the encoder. But at the moment it does not. I do prefer to leave that task for the one that adds opus decoder functions. Also this change alone would break compilation, since it also requires changing the libavcodec/x86/Makefile and adding the guard inside the opus_dsp_init.c Another option is to have "opus_enc_dsp_init.c" and call the function "ff_opus_enc_dsp_init_x86()". Do tell me which option do you prefer and do you insist on v7 just for that. > The assembly code looks fine to me, but other people will have to take a > look at it in case I'm missing something.
On 27 July 2017 at 09:38, Ivan Kalvachev <ikalvachev@gmail.com> wrote: > On 7/27/17, Rostislav Pehlivanov <atomnuker@gmail.com> wrote: > > On 26 July 2017 at 15:56, Ivan Kalvachev <ikalvachev@gmail.com> wrote: > > > >> + if (ARCH_X86 && CONFIG_OPUS_ENCODER) > >> + ff_opus_dsp_init_x86(s); > >> > > > > Just change it to > > + if (ARCH_X86) > > > > The init function is named opus_dsp, so it'll get used to other opus > > things, not just the encoder. > > But at the moment it does not. > I do prefer to leave that task for the one that > adds opus decoder functions. > > Also this change alone would break compilation, since > it also requires changing the libavcodec/x86/Makefile > and adding the guard inside the opus_dsp_init.c > > Another option is to have "opus_enc_dsp_init.c" and call > the function "ff_opus_enc_dsp_init_x86()". > > Do tell me which option do you prefer > and do you insist on v7 just for that. > > > The assembly code looks fine to me, but other people will have to take a > > look at it in case I'm missing something. The former, but that can be changed later after pushing
On Wed, Jul 26, 2017 at 4:56 PM, Ivan Kalvachev <ikalvachev@gmail.com> wrote: > +++ b/libavcodec/x86/opus_pvq_search.asm Generic minor stuff: Use rN instead of rNq for numbered registers (q suffix is used for named args only due to preprocessor limitations). Use the same "standard" vertical alignment rules as most existing code, e.g. instructions indented by 4 spaces and operands aligned on the first comma. Use xmN instead of xmmN (only really makes a difference when SWAP:s are used, but might as well do it "correctly"). Use 32-bit operands, e.g. rNd, when 64-bit math isn't required. Unless aligning every single loop entry helps a lot I'd avoid it since it does waste a bit of icache. Explicitly using the carry flag as a branch condition is a bit weird. Generally using jg/jl/jge/jle tends to be clearer. > +%ifdef __NASM_VER__ > +%use "smartalign" > +ALIGNMODE p6 > +%endif Assembler-specific ifdeffery should be avoided in individual files. Something equivalent already exists in x86inc actually but I don't remember if it was merged to FFmpeg from upstream (x264) yet. > +const_int32_offsets: > + %rep 8 > + dd $-const_int32_offsets > + %endrep Isn't this just an overly complicated way of expressing "dd 0*4, 1*4, 2*4, 3*4, 4*4, 5*4, 6*4, 7*4"? > +SECTION .text > + > + > + > + Reduce some of the excessive whitespace. > +%macro HSUMPS 2 ; dst/src, tmp > +%if cpuflag(avx) > + %if sizeof%1>=32 ; avx > + vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1) > + vaddps %1, %2 > + %endif > + vshufps %2, %1, %1, q1032 > + vaddps %1, %2 > + vshufps %2, %1, %1, q0321 > + vaddps %1, %2 > + > +%else ; this form is a bit faster than the short avx-like emulation. > + movaps %2, %1 ; [d, c, b, a ] > + shufps %1, %1, q1032 ; %2=[b, a, d, c ] > + addps %1, %2 ; %1=[b+d, a+c, d+b, c+a ] > + movaps %2, %1 > + shufps %1, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ] > + addps %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ] > + ; all %1 members should be equal for as long as float a+b==b+a > +%endif > +%endmacro Is reordering moves for the non-AVX path worth the additional complexity? Microoptimizations that only affect legacy hardware are IMO a bit questionable. > +%macro EMU_pblendvb 3 ; dst/src_a, src_b, mask > +%if cpuflag(avx) > + %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32 > + %error AVX1 does not support integer 256 bit ymm operations > + %endif > + > + vpblendvb %1, %1, %2, %3 > +;------------------------- > +%elif cpuflag(sse4) > + %ifnidn %3,xmm0 > + %error sse41 blendvps uses xmm0 as default 3 operand, you used %3 > + %endif > + pblendvb %1, %2, %3 > +;------------------------- > +%else > + pxor %2, %1 > + pand %2, %3 > + pxor %1, %2 > +%endif > +%endmacro The AVX1 integer warning is kind of silly and doesn't really serve any purpose. > +%macro EMU_broadcastss 2 ; dst, src > +%if cpuflag(avx2) > + vbroadcastss %1, %2 ; ymm, xmm > +;------------------------- > +%elif cpuflag(avx) > + %ifnum sizeof%2 ; avx1 register > + vpermilps xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm > + %if sizeof%1 >= 32 ; mmsize>=32 > + vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm, im > + %endif > + %else ; avx1 memory > + vbroadcastss %1, %2 ; ymm, mm32 || xmm, mm32 > + %endif > +;------------------------- > +%else > + %ifnum sizeof%2 ; sse register > + shufps %1, %2, %2, q0000 > + %else ; sse memory > + movss %1, %2 ; this zeroes the other 3 elements > + shufps %1, %1, 0 > + %endif > +%endif > +%endmacro Use the existing x86util VBROADCASTSS macro. Modify it if it's lacking something. 
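For reference, the %rep table questioned above and Henrik's one-liner assemble to the same eight dwords (0, 4, 8, ..., 28). A third spelling, sketched below in plain NASM, keeps the element size a single edit away, which appears to be what the %rep construct was aiming for; it is offered only as an illustration, not as code from the patch:

    const_int32_offsets:
    %assign i 0
    %rep 8
        dd i*4
        %assign i i+1
    %endrep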
+%macro EMU_pbroadcastd 2 ; dst, src +%if cpuflag(avx2) + vpbroadcastd %1, %2 +%elif cpuflag(avx) && mmsize >= 32 + %error AVX1 does not support integer 256 bit ymm operations +%else + %ifnum sizeof%2 ; sse2 register + pshufd %1, %2, q0000 + %else ; sse memory + movd %1, %2 ; movd zero extends + pshufd %1, %1, 0 + %endif +%endif +%endmacro Same as above, but using the SPLATD macro. > +; Merge parallel maximums final round (1 vs 1) > + movaps xmm0, xmm3 ; m0 = m3[0] = p[0] > + shufps xmm3, xmm3, q1111 ; m3 = m3[1] = p[1] > + cmpless xmm0, xmm3 Use 3-arg instead of reg-reg movaps. Use the actual cmpss instruction instead of psuedo-instructions. x86inc doesn't support automatic VEX handling for every possible psuedo-instruction (mainly because there's so many of them) which will result in AVX state transitions if INIT_YMM is used. > +%macro SET_PIC_BASE 3; reg, const_label > +%ifdef PIC > + %{1} %2, [%3] ; lea r5q, [rip+const] > + %define pic_base_%3 %2 > +%else > + %define pic_base_%3 %3 > +%endif > +%endmacro [...] > + SET_PIC_BASE lea, r5q, const_align_abs_edge ; rip+const > + movups m3, [pic_base_const_align_abs_edge + r4q - mmsize] This macro is only ever used once, remove and inline it to reduce complexity. > +%if num_mmregs > 8 > +%define num_hireg_used 3 > +%endif IIRC the number of used mmregs is ignored in 32-bit mode so I believe you can unconditionally specify 11. > + lea r4q, [Nq - mmsize] ; Nq is rounded up (aligned up) to mmsize, so r4q can't become negative here, unless N=0. > + movups m2, [inXq + r4q] > + andps m2, m3 > + movaps [tmpX + r4q], m2 > + movaps m1, m2 ; Sx Redundant reg-reg movaps, use m1 directly instead of using m2 at all. > + %if PRESEARCH_ROUNDING == 0 > + cvttps2dq m2, m2 ; yt = (int)truncf( res*X[i] ) > + %else > + cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] ) > + %endif > + paddd m5, m2 ; Sy += yt > + cvtdq2ps m2, m2 ; yt = (float)yt Would using roundps instead of doing float-int-float conversion be beneficial? > + mulps m1, m2 ; m1 = X[i]*yt > + movaps [tmpY + r4q], m2 ; y[i] = m2 > + addps m7, m1 ; Sxy += m1; > + mulps m2, m2 ; m2 = yt*yt > + addps m6, m2 ; Syy += m2 Potential use case for 2x FMA instead of 2x mul+add. > +%%restore_sign_loop: > + movaps m0, [tmpY + r4q] ; m0 = Y[i] > + movups m1, [inXq + r4q] ; m1 = X[i] > + andps m1, m2 ; m1 = sign(X[i]) > + orps m0, m1 ; m0 = Y[i]*sign > + cvtps2dq m3, m0 ; m3 = (int)m0 > + movaps [outYq + r4q], m3 The orps instruction could be done using a memory operand instead of movaps+orps. AVX supports unaligned memory operands so the same could conditionally be done for the andps as well but that's a bit of a nit. > +%if ARCH_X86_64 == 0 ;sbrdsp > + movss r0m, xmm6 ; return (float)Syy_norm > + fld dword r0m > +%else > + movaps m0, m6 ; return (float)Syy_norm > +%endif > + RET The reg-reg movaps on x64 could be optimized away by reordering registers so that the return value ends up in m0 instead of m6 from the start. > +INIT_XMM avx The code has a bunch of if conditions for ymm regs but ymm regs are never actually used?
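A note on the cmpless remark above: "cmpless a, b" is assembler shorthand that expands to "cmpss a, b, 2" (predicate 2 = less-than-or-equal). Only the explicit spelling goes through x86inc's instruction overloading, so only it gets VEX-encoded in AVX functions; a minimal sketch:

    INIT_XMM avx
        cmpless  xm0, xm3      ; pseudo-instruction: always emits the legacy SSE encoding
        cmpss    xm0, xm3, 2   ; same compare: x86inc emits vcmpss xm0, xm0, xm3, 2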
On 7/31/17, Henrik Gramner <henrik@gramner.com> wrote: > On Wed, Jul 26, 2017 at 4:56 PM, Ivan Kalvachev <ikalvachev@gmail.com> > wrote: >> +++ b/libavcodec/x86/opus_pvq_search.asm > > Generic minor stuff: > > Use rN instead of rNq for numbered registers (q suffix is used for > named args only due to preprocessor limitations). Done. Is this documented? > Use the same "standard" vertical alignment rules as most existing > code, e.g. instructions indented by 4 spaces and operands aligned on > the first comma. What do you mean operands aligned on the first comma? The longest operand I had is "xmm0" , counting comma and space I get 6 position alignment for operands. Now, with "xm7" i can lower this to 5 position. Should I do that? (I don't have "xm15" ). Also, I've aligned the operand at 12 positions after their instruction start, because this is the size of the longest real instruction. As for why i have instruction start at 8'th position. It's because I've allocated the field at position 4-7 for instruction prefixes, and the "EMU_" is 4 positions. Now I have: 12345678____________12345_12345_12345_ EMU_broadcastss ym13, xm10 EMU_blendvps xm1, xm2, m3 vblendvps blendvps I can ditch the prefix and shorten the operands. e.g. : 1234_____________1234_1234_1234_ VBROADCASTSS ym1, xm1 BLENDVPS m1, m2, m3 Is that acceptable? Or maybe you do mean: 1234_____________1234_1234_123 VBROADCASTSS ym1, xm1 BLENDVPS m1, m2, m3 However I would prefer to use 1234_____________1234_1234_123 VBROADCASTSS ym1, xm1 BLENDVPS m1, m2, m3 PBLENDVD xm1, xm2, xm3 (You do need fixed width font to see that correctly). I'll wait for reply before doing that. > Use xmN instead of xmmN (only really makes a difference when SWAP:s > are used, but might as well do it "correctly"). Done. > Use 32-bit operands, e.g. rNd, when 64-bit math isn't required. Done > Unless aligning every single loop entry helps a lot I'd avoid it since > it does waste a bit of icache. I'll redo my benchmarks, but I have placed these aligns for a reason. At some point removed debug instructions started making my code slower. So I've placed align to avoid random slowdowns. > Explicitly using the carry flag as a branch condition is a bit weird. > Generally using jg/jl/jge/jle tends to be clearer. I use it to take advantage of the so called MacroFusion. It allows the merge of cmp and jxx, as long as the branch checks only one flag, so all signed branches are to be avoided (as stated by intel manual). The newer the cpu, the more opcodes (add/sub) could be merged and less limitations. >> +%ifdef __NASM_VER__ >> +%use "smartalign" >> +ALIGNMODE p6 >> +%endif > > Assembler-specific ifdeffery should be avoided in individual files. > Something equivalent already exists in x86inc actually but I don't > remember if it was merged to FFmpeg from upstream (x264) yet. Definitely not merged. I hear a lot about the improved x86inc, maybe it is time for you to merge it in FFmpeg? >> +const_int32_offsets: >> + %rep 8 >> + dd $-const_int32_offsets >> + %endrep > > Isn't this just an overly complicated way of expressing "dd 0*4, 1*4, > 2*4, 3*4, 4*4, 5*4, 6*4, 7*4"? Yes. Do you know a way that works with "times 8"? I've done it this way to make it easy to change the size of the constant. Do you request I change it? >> +SECTION .text >> + >> + >> + >> + > > Reduce some of the excessive whitespace. Will do with the other cosmetics. 
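A sketch of the branch idiom Ivan is describing (the register, label and loop body are illustrative, not lifted from the patch): the flag-setting ALU instruction and the following conditional jump can be macro-fused into a single µop on CPUs that support fusing sub with jcc, and the carry-based condition is the single-flag form Ivan prefers:

        sub   r4d, mmsize      ; counter -= mmsize; CF is set on a borrow
        jnc   .loop            ; loop while the subtraction did not borrow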
>> +%macro HSUMPS 2 ; dst/src, tmp >> +%if cpuflag(avx) >> + %if sizeof%1>=32 ; avx >> + vperm2f128 %2, %1, %1, (0)*16+(1) ; >> %2=lo(%1)<<128+hi(%1) >> + vaddps %1, %2 >> + %endif >> + vshufps %2, %1, %1, q1032 >> + vaddps %1, %2 >> + vshufps %2, %1, %1, q0321 >> + vaddps %1, %2 >> + >> +%else ; this form is a bit faster than the short avx-like emulation. >> + movaps %2, %1 ; [d, c, b, >> a ] >> + shufps %1, %1, q1032 ; %2=[b, a, d, >> c ] >> + addps %1, %2 ; %1=[b+d, a+c, d+b, >> c+a ] >> + movaps %2, %1 >> + shufps %1, %1, q0321 ; %2=[c+a, b+d, a+c, >> d+b ] >> + addps %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, >> d+b+c+a ] >> + ; all %1 members should be equal for as long as float a+b==b+a >> +%endif >> +%endmacro > > Is reordering moves for the non-AVX path worth the additional > complexity? Microoptimizations that only affect legacy hardware are > IMO a bit questionable. Yes, I'm on "legacy" hardware and the improvement is reliably measurable. >> +%macro EMU_pblendvb 3 ; dst/src_a, src_b, mask >> +%if cpuflag(avx) >> + %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32 >> + %error AVX1 does not support integer 256 bit ymm operations >> + %endif >> + >> + vpblendvb %1, %1, %2, %3 >> +;------------------------- >> +%elif cpuflag(sse4) >> + %ifnidn %3,xmm0 >> + %error sse41 blendvps uses xmm0 as default 3 operand, you used %3 >> + %endif >> + pblendvb %1, %2, %3 >> +;------------------------- >> +%else >> + pxor %2, %1 >> + pand %2, %3 >> + pxor %1, %2 >> +%endif >> +%endmacro > > The AVX1 integer warning is kind of silly and doesn't really serve any > purpose. As I've said in this thread before, If you use avx1 ymm with this macro you won't get any warning that avx2 opcode has been generated because x86inc does not overload avx2. I've tested it. (And again, merge the improved avx2 x86inc) >> +%macro EMU_broadcastss 2 ; dst, src >> +%if cpuflag(avx2) >> + vbroadcastss %1, %2 ; ymm, xmm >> +;------------------------- >> +%elif cpuflag(avx) >> + %ifnum sizeof%2 ; avx1 register >> + vpermilps xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm >> + %if sizeof%1 >= 32 ; mmsize>=32 >> + vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm, im >> + %endif >> + %else ; avx1 memory >> + vbroadcastss %1, %2 ; ymm, mm32 || xmm, mm32 >> + %endif >> +;------------------------- >> +%else >> + %ifnum sizeof%2 ; sse register >> + shufps %1, %2, %2, q0000 >> + %else ; sse memory >> + movss %1, %2 ; this zeroes the other 3 >> elements >> + shufps %1, %1, 0 >> + %endif >> +%endif >> +%endmacro > > Use the existing x86util VBROADCASTSS macro. Modify it if it's lacking > something. The x86util VBROADCASTSS macro implements only the avx1 variant that is memory to register. My variant emulates the avx2 variant, that also does reg to reg. My variant also avoids write dependency on "movss reg, reg". ("movss reg, mem" clears the other elements, but "movss reg, reg" preserves them. This creates dependency on the old values of the register.) And yes, I did ask if I should separate these macros in another file and if I should try to merge them in x86util.asm first. What do you think on the matter? > +%macro EMU_pbroadcastd 2 ; dst, src > +%if cpuflag(avx2) > + vpbroadcastd %1, %2 > +%elif cpuflag(avx) && mmsize >= 32 > + %error AVX1 does not support integer 256 bit ymm operations > +%else > + %ifnum sizeof%2 ; sse2 register > + pshufd %1, %2, q0000 > + %else ; sse memory > + movd %1, %2 ; movd zero extends > + pshufd %1, %1, 0 > + %endif > +%endif > +%endmacro > > Same as above, but using the SPLATD macro. 
Again, SPLATD does not handle avx2. Also I do think it is better to use a name of existing instruction instead of a made up one. >> +; Merge parallel maximums final round (1 vs 1) >> + movaps xmm0, xmm3 ; m0 = m3[0] = p[0] >> + shufps xmm3, xmm3, q1111 ; m3 = m3[1] = p[1] >> + cmpless xmm0, xmm3 > > Use 3-arg instead of reg-reg movaps. Do you have that in mind: shufps xm0, xm3,xm3, q1111 cmpnless xm0, xm3 (seems to work fine). Done. > Use the actual cmpss instruction instead of psuedo-instructions. > x86inc doesn't support automatic VEX handling for every possible > psuedo-instruction (mainly because there's so many of them) which will > result in AVX state transitions if INIT_YMM is used. Not according to Intel Architecture Code Analyzer and the benchmarks I've got. The benchmarks showed that using cmpps is about 5% slower on Ryzen and absolutely same speed on SkyLake. That's in avx2 ymm. Still I'll change it. Done. >> +%macro SET_PIC_BASE 3; reg, const_label >> +%ifdef PIC >> + %{1} %2, [%3] ; lea r5q, [rip+const] >> + %define pic_base_%3 %2 >> +%else >> + %define pic_base_%3 %3 >> +%endif >> +%endmacro > [...] >> + SET_PIC_BASE lea, r5q, const_align_abs_edge ; rip+const >> + movups m3, [pic_base_const_align_abs_edge + r4q - mmsize] > > This macro is only ever used once, remove and inline it to reduce > complexity. Can I move it to x86util too? Unified handling of PIC mangling is something welcomed. I do think this is the right way. ;) I think that at least one other file had something very similar. I find explicit checks with ifdef/else/endif a lot more uglier and hard to follow. >> +%if num_mmregs > 8 >> +%define num_hireg_used 3 >> +%endif > > IIRC the number of used mmregs is ignored in 32-bit mode so I believe > you can unconditionally specify 11. Is that documented and guaranteed to not change in future? I had it at 11 unconditionally. But then decided to play safe. >> + lea r4q, [Nq - mmsize] ; Nq is rounded up (aligned up) >> to mmsize, so r4q can't become negative here, unless N=0. >> + movups m2, [inXq + r4q] >> + andps m2, m3 >> + movaps [tmpX + r4q], m2 >> + movaps m1, m2 ; Sx > > Redundant reg-reg movaps, use m1 directly instead of using m2 at all. Nice catch Done. >> + %if PRESEARCH_ROUNDING == 0 >> + cvttps2dq m2, m2 ; yt = (int)truncf( res*X[i] ) >> + %else >> + cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] ) >> + %endif >> + paddd m5, m2 ; Sy += yt >> + cvtdq2ps m2, m2 ; yt = (float)yt > > Would using roundps instead of doing float-int-float conversion be > beneficial? Not on any CPU I got benchmarks from (And this includes Skylake and Ryzen). When I rant about Intel making ops that are slower than their multi op equivalents... this is one of my prime examples. >> + mulps m1, m2 ; m1 = X[i]*yt >> + movaps [tmpY + r4q], m2 ; y[i] = m2 >> + addps m7, m1 ; Sxy += m1; >> + mulps m2, m2 ; m2 = yt*yt >> + addps m6, m2 ; Syy += m2 > > Potential use case for 2x FMA instead of 2x mul+add. Hum... FMULADD_PS m7, m1, m2, m7, m1 +mulps m1, m1, m2 +addps m7, m7, m1 FMULADD_PS m6, m2, m2, m6, m2 +mulps m2, m2, m2 +addps m2, m6, m2 FMA is newer than AVX1. To actually use the opcodes I'd need to have 4'th version just for it or use avx2. Without possibility of using FMA, the macro usage would be just an obfuscation. Do you insist on it? 
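For reference, on an FMA-capable target (e.g. an AVX2 build, since in practice AVX2 hardware also has FMA3, as Henrik notes later) the two mul+add pairs quoted above could each collapse into one fused instruction. A sketch using the raw FMA3 mnemonic rather than the x86util macro:

        ; vfmadd231ps dst, a, b  computes  dst += a*b  without clobbering a or b
        vfmadd231ps  m7, m1, m2    ; Sxy += X[i]*yt
        vfmadd231ps  m6, m2, m2    ; Syy += yt*yt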
>> +%%restore_sign_loop: >> + movaps m0, [tmpY + r4q] ; m0 = Y[i] >> + movups m1, [inXq + r4q] ; m1 = X[i] >> + andps m1, m2 ; m1 = sign(X[i]) >> + orps m0, m1 ; m0 = Y[i]*sign >> + cvtps2dq m3, m0 ; m3 = (int)m0 >> + movaps [outYq + r4q], m3 > > The orps instruction could be done using a memory operand instead of > movaps+orps. I think I had tried that, but it was slower on my CPU. I redid the benchmarks results. - synthetic K=3, N=176 : old : 13170 13302 13340 13458 13562 new : 13215 13218 13273 13344 13805 - sample 1 (40min) default old : 2293 2300 2328 2331 2350 new : 2305 2307 2325 2333 2334 - sample2 (2h) default old : 2462 2468 2497 2519 2519 new : 2481 2486 2514 2529 2525 Do you insist on doing the change? > AVX supports unaligned memory operands so the same could conditionally > be done for the andps as well but that's a bit of a nit. > >> +%if ARCH_X86_64 == 0 ;sbrdsp >> + movss r0m, xmm6 ; return (float)Syy_norm >> + fld dword r0m >> +%else >> + movaps m0, m6 ; return (float)Syy_norm >> +%endif >> + RET > > The reg-reg movaps on x64 could be optimized away by reordering > registers so that the return value ends up in m0 instead of m6 from > the start. No, it cannot. I need m0 for the blendv mask on sse4. >> +INIT_XMM avx > > The code has a bunch of if conditions for ymm regs but ymm regs are > never actually used? I was asked to remove the INIT_YMM avx2 . The benchmarks showed that the resulting code is slightly slower in the default case, so I had the code disabled. But a lot of people cannot stand having disabled code around. :| If you want to help, you might try to find out why the code is slower. My main theory is that 8 floats at once might be just too much, 90% of the searches are with N<32. I don't have avx2 capable cpu to play with. Best Regards
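For completeness, the memory-operand form of the restore-sign loop that Henrik suggested (and that the benchmarks above compare against) would look roughly like this, assuming as in the quoted code that tmpY is an aligned buffer and m2 holds the sign mask:

    %%restore_sign_loop_alt:             ; hypothetical alternative spelling
        movups    m1, [inXq + r4q]       ; m1 = X[i]
        andps     m1, m2                 ; m1 = sign(X[i])
        orps      m1, [tmpY + r4q]       ; fold the aligned Y[i] load into the orps
        cvtps2dq  m3, m1                 ; m3 = (int)(Y[i]*sign)
        movaps    [outYq + r4q], m3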
On Tue, Aug 1, 2017 at 11:46 PM, Ivan Kalvachev <ikalvachev@gmail.com> wrote: > On 7/31/17, Henrik Gramner <henrik@gramner.com> wrote: >> Use rN instead of rNq for numbered registers (q suffix is used for >> named args only due to preprocessor limitations). > > Is this documented? Not sure, but there's probably a bunch small things like this that aren't really well documented. >> Use the same "standard" vertical alignment rules as most existing >> code, e.g. instructions indented by 4 spaces and operands aligned on >> the first comma. > > What do you mean operands aligned on the first comma? > The longest operand I had is "xmm0" , counting comma and space > I get 6 position alignment for operands. > Now, with "xm7" i can lower this to 5 position. Should I do that? > (I don't have "xm15" ). > 1234_____________1234_1234_123 VBROADCASTSS ym1, xm1 BLENDVPS m1, m2, m3 is the most commonly used alignment. >> Unless aligning every single loop entry helps a lot I'd avoid it since >> it does waste a bit of icache. > > I'll redo my benchmarks, but I have placed these aligns for a reason. > At some point removed debug instructions started making my code > slower. So I've placed align to avoid random slowdowns. Ok. >> Explicitly using the carry flag as a branch condition is a bit weird. >> Generally using jg/jl/jge/jle tends to be clearer. > > I use it to take advantage of the so called MacroFusion. > It allows the merge of cmp and jxx, as long as the branch > checks only one flag, so all signed branches are to be avoided > (as stated by intel manual). > The newer the cpu, the more opcodes (add/sub) > could be merged and less limitations. Every µarch that can do macro-op fusion with add/sub can do so with both signed and unsigned branch conditions. There's actually only a single µarch that can fuse cmp with unsigned but not signed branch conditions and that's more than a decade old (Conroe), but if you want to check if (unsigned)a < (unsigned)b 'jb' is preferred of 'jc' (both produce the exact same opcode). >> Assembler-specific ifdeffery should be avoided in individual files. >> Something equivalent already exists in x86inc actually but I don't >> remember if it was merged to FFmpeg from upstream (x264) yet. > > Definitely not merged. > > I hear a lot about the improved x86inc, > maybe it is time for you to merge it in FFmpeg? Now that I think about it I believe that part wasn't merged because smartalign is broken on some older nasm versions (which is another reason for avoiding things like this in individual files) and FFmpeg doesn't enforce any specific version requirements. I guess it could be worked around with some version checking ifdeffery if we know which versions are affected. >> Isn't this just an overly complicated way of expressing "dd 0*4, 1*4, >> 2*4, 3*4, 4*4, 5*4, 6*4, 7*4"? > > Yes. > Do you know a way that works with "times 8"? > I've done it this way to make it easy to change the size of the constant. > > Do you request I change it? I'd prefer just using that simple one-liner I posted. Replacing the numbers if you want to change things later is trivial. >> Is reordering moves for the non-AVX path worth the additional >> complexity? Microoptimizations that only affect legacy hardware are >> IMO a bit questionable. > > Yes, I'm on "legacy" hardware and the improvement is reliably measurable. Ok, in that case you should also use shufps/addps instead of vshufps/vaddps in the AVX branch for cosmetic reasons though (x86inc performs automatic VEX-encoding). 
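To make the last point concrete: under an AVX-flagged INIT, x86inc overloads the unprefixed mnemonics, so the plain spelling already produces the VEX encoding; a minimal sketch:

    INIT_XMM avx
        addps   m1, m2         ; assembles to vaddps xmm1, xmm1, xmm2
        vaddps  m1, m1, m2     ; identical encoding, written out explicitly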
>> The AVX1 integer warning is kind of silly and doesn't really serve any >> purpose. > > As I've said in this thread before, > If you use avx1 ymm with this macro > you won't get any warning that avx2 opcode has been generated > because x86inc does not overload avx2. Sure, but using any other AVX2 instruction in AVX functions wont result in any warnings either so it doesn't make much sense to do it in this specific case. >> Use the existing x86util VBROADCASTSS macro. Modify it if it's lacking >> something. > > The x86util VBROADCASTSS macro implements only > the avx1 variant that is memory to register. > My variant emulates the avx2 variant, > that also does reg to reg. > > My variant also avoids write dependency on "movss reg, reg". > ("movss reg, mem" clears the other elements, but > "movss reg, reg" preserves them. This creates dependency on the old > values of the register.) > > And yes, I did ask if I should separate > these macros in another file and > if I should try to merge them in x86util.asm first. > > What do you think on the matter? Add your improvements to the existing x86util macro (can be done in the same patch) and then use that. >> +%macro EMU_pbroadcastd 2 ; dst, src >> +%if cpuflag(avx2) >> + vpbroadcastd %1, %2 >> +%elif cpuflag(avx) && mmsize >= 32 >> + %error AVX1 does not support integer 256 bit ymm operations >> +%else >> + %ifnum sizeof%2 ; sse2 register >> + pshufd %1, %2, q0000 >> + %else ; sse memory >> + movd %1, %2 ; movd zero extends >> + pshufd %1, %1, 0 >> + %endif >> +%endif >> +%endmacro >> >> Same as above, but using the SPLATD macro. > > Again, SPLATD does not handle avx2. > Also I do think it is better to use a name of existing instruction > instead of a made up one. Same as above. Yes, I agree that the current naming is inconsistent (that macro was created before broadcast instructions existed) and I'm OK with renaming it, but if so it should be done in a separate patch. >> Use the actual cmpss instruction instead of psuedo-instructions. >> x86inc doesn't support automatic VEX handling for every possible >> psuedo-instruction (mainly because there's so many of them) which will >> result in AVX state transitions if INIT_YMM is used. > > Not according to Intel Architecture Code Analyzer > and the benchmarks I've got. > The benchmarks showed that using cmpps is about 5% slower on Ryzen > and absolutely same speed on SkyLake. > That's in avx2 ymm. cmpss, not cmpps. "cmpless a, b" is a psuedo-instruction which the assembler will convert into the actual instruction "cmpss a, b, 2". One could of course argue whether or not x86inc should deal with psuedo-instructions but right now it doesn't so it will cause AVX state transitions if INIT_YMM is used since it wont be VEX-encoded. >> This macro is only ever used once, remove and inline it to reduce >> complexity. > > Can I move it to x86util too? > > Unified handling of PIC mangling is something welcomed. > I do think this is the right way. ;) > I think that at least one other file had something very similar. Doing PIC-mangling optimally will in many cases be context-dependent which is probably why it doesn't already exist. For example if you need to do multiple complex PIC-loads you should do a single lea and reuse it which is harder to convey as a generic macro. >> IIRC the number of used mmregs is ignored in 32-bit mode so I believe >> you can unconditionally specify 11. > > Is that documented and guaranteed to not change in future? > I had it at 11 unconditionally. But then decided to play safe. 
There are no callee-saved xmm registers or anything else in x86-32 that would require keeping track of how many of them are clobbered. >> Potential use case for 2x FMA instead of 2x mul+add. > > FMA is newer than AVX1. To actually use the opcodes > I'd need to have 4'th version just for it or use avx2. > > Without possibility of using FMA, the macro usage > would be just an obfuscation. > > Do you insist on it? I wouldn't bother with a separate FMA version but if you end up doing AVX2 it should definitely be done as part of it (AVX2 is a superset of FMA3). >>> +%%restore_sign_loop: >>> + movaps m0, [tmpY + r4q] ; m0 = Y[i] >>> + movups m1, [inXq + r4q] ; m1 = X[i] >>> + andps m1, m2 ; m1 = sign(X[i]) >>> + orps m0, m1 ; m0 = Y[i]*sign >>> + cvtps2dq m3, m0 ; m3 = (int)m0 >>> + movaps [outYq + r4q], m3 >> >> The orps instruction could be done using a memory operand instead of >> movaps+orps. > > I think I had tried that, but it was slower on my CPU. That's quite weird if it's the case. It's literally the same amount of µops just with a tiny bit shorter code size, so I would probably chalk it up to random differences in code alignment or something. Either way it's really not important so feel free to keep it as it is. >> The reg-reg movaps on x64 could be optimized away by reordering >> registers so that the return value ends up in m0 instead of m6 from >> the start. > > No, it cannot. > I need m0 for the blendv mask on sse4. Oh right, I'm dumb. Never mind. >> The code has a bunch of if conditions for ymm regs but ymm regs are >> never actually used? > > I was asked to remove the INIT_YMM avx2 . > > The benchmarks showed that the resulting code is > slightly slower in the default case, so I had the code disabled. > But a lot of people cannot stand having disabled code around. :| > > If you want to help, you might try to find out why the code is slower. > My main theory is that 8 floats at once might be just too much, > 90% of the searches are with N<32. > > I don't have avx2 capable cpu to play with. Likely due to the cmpless AVX state transition I mentioned earlier. Could be worth re-testing once that is taken care of.
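In x86inc terms, the point about x86-32 above means the xmm-register count passed to cglobal can simply be the x86-64 worst case. The prologue below is an illustrative placeholder (the function name and argument list are hypothetical, not the patch's actual cglobal line):

        ; the fourth parameter (11 xmm registers) is ignored on x86-32, where
        ; no xmm registers are callee-saved, so it can be given unconditionally
        cglobal pvq_search_example, 4, 5, 11, inX, outY, K, N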
On 8/2/17, Henrik Gramner <henrik@gramner.com> wrote: > On Tue, Aug 1, 2017 at 11:46 PM, Ivan Kalvachev <ikalvachev@gmail.com> > wrote: >> On 7/31/17, Henrik Gramner <henrik@gramner.com> wrote: >>> Use rN instead of rNq for numbered registers (q suffix is used for >>> named args only due to preprocessor limitations). >> >> Is this documented? > > Not sure, but there's probably a bunch small things like this that > aren't really well documented. > >>> Use the same "standard" vertical alignment rules as most existing >>> code, e.g. instructions indented by 4 spaces and operands aligned on >>> the first comma. >> >> What do you mean operands aligned on the first comma? >> The longest operand I had is "xmm0" , counting comma and space >> I get 6 position alignment for operands. >> Now, with "xm7" i can lower this to 5 position. Should I do that? >> (I don't have "xm15" ). >> > > 1234_____________1234_1234_123 > VBROADCASTSS ym1, xm1 > BLENDVPS m1, m2, m3 > > is the most commonly used alignment. I see that a lot of .asm files use different alignments. I'll try to pick something similar that I find good looking. I also see that different function/macro can have different position for the first comma. This could be quite useful, to visually separate the macros. >>> Unless aligning every single loop entry helps a lot I'd avoid it since >>> it does waste a bit of icache. >> >> I'll redo my benchmarks, but I have placed these aligns for a reason. >> At some point removed debug instructions started making my code >> slower. So I've placed align to avoid random slowdowns. > > Ok. > >>> Explicitly using the carry flag as a branch condition is a bit weird. >>> Generally using jg/jl/jge/jle tends to be clearer. >> >> I use it to take advantage of the so called MacroFusion. >> It allows the merge of cmp and jxx, as long as the branch >> checks only one flag, so all signed branches are to be avoided >> (as stated by intel manual). >> The newer the cpu, the more opcodes (add/sub) >> could be merged and less limitations. > > Every µarch that can do macro-op fusion with add/sub can do so with > both signed and unsigned branch conditions. > There's actually only a single µarch that can fuse cmp with unsigned > but not signed branch conditions and that's more than a decade old > (Conroe), but if you want to check if (unsigned)a < (unsigned)b 'jb' > is preferred of 'jc' (both produce the exact same opcode). One is enough reason for me :} Since the same thing works just as well or better on newer CPU's. Also, using above/bellow on arithmetic operations is a lot more confusing than carry/borrow. You can see that I use "jc" and "jnc" after "sub". I'll keep it as it is, unless you have solid reason to change it. >>> Assembler-specific ifdeffery should be avoided in individual files. >>> Something equivalent already exists in x86inc actually but I don't >>> remember if it was merged to FFmpeg from upstream (x264) yet. >> >> Definitely not merged. >> >> I hear a lot about the improved x86inc, >> maybe it is time for you to merge it in FFmpeg? > > Now that I think about it I believe that part wasn't merged because > smartalign is broken on some older nasm versions (which is another > reason for avoiding things like this in individual files) and FFmpeg > doesn't enforce any specific version requirements. I guess it could be > worked around with some version checking ifdeffery if we know which > versions are affected. 
I don't see anything relevant in here: http://repo.or.cz/nasm.git/history/HEAD:/macros/smartalign.mac I also didn't notice anything similar in the changelog. I don't think this is relevant reason for not merging avx2 support. ;) >>> Isn't this just an overly complicated way of expressing "dd 0*4, 1*4, >>> 2*4, 3*4, 4*4, 5*4, 6*4, 7*4"? >> >> Yes. >> Do you know a way that works with "times 8"? >> I've done it this way to make it easy to change the size of the constant. >> >> Do you request I change it? > > I'd prefer just using that simple one-liner I posted. Replacing the > numbers if you want to change things later is trivial. It might be trivial, but it also makes it easier to misspell something. I'd prefer to keep it as it is. >>> Is reordering moves for the non-AVX path worth the additional >>> complexity? Microoptimizations that only affect legacy hardware are >>> IMO a bit questionable. >> >> Yes, I'm on "legacy" hardware and the improvement is reliably measurable. > > Ok, in that case you should also use shufps/addps instead of > vshufps/vaddps in the AVX branch for cosmetic reasons though (x86inc > performs automatic VEX-encoding). Only on instructions it knows... that brings us again to the AVX2. I had it mixed before, but I changed it to be consistent. Some new avx instruction are not overloaded or available without their "v-" prefix. e.g. x86util uses vpbroadcastw in SPLATW. It makes more sense to use "v" prefix for code that is guaranteed to be avx. >>> The AVX1 integer warning is kind of silly and doesn't really serve any >>> purpose. >> >> As I've said in this thread before, >> If you use avx1 ymm with this macro >> you won't get any warning that avx2 opcode has been generated >> because x86inc does not overload avx2. > > Sure, but using any other AVX2 instruction in AVX functions wont > result in any warnings either so it doesn't make much sense to do it > in this specific case. Isn't one of the selling points for x86inc that it would warn if unsupported instructions are used? To be honest, at first I had that function use equivalent logical operation from the float point domain. It does impose penalty on domain transition and I do think that doing that behind developer back is bad idea. This is why I think that this error should remain. At least, until usage of avx2 in avx1 code gives errors. At that point the error would be redundant and could be removed. >>> Use the existing x86util VBROADCASTSS macro. Modify it if it's lacking >>> something. >> >> The x86util VBROADCASTSS macro implements only >> the avx1 variant that is memory to register. >> My variant emulates the avx2 variant, >> that also does reg to reg. >> >> My variant also avoids write dependency on "movss reg, reg". >> ("movss reg, mem" clears the other elements, but >> "movss reg, reg" preserves them. This creates dependency on the old >> values of the register.) >> >> And yes, I did ask if I should separate >> these macros in another file and >> if I should try to merge them in x86util.asm first. >> >> What do you think on the matter? > > Add your improvements to the existing x86util macro (can be done in > the same patch) and then use that. FFmpeg guidelines request that it is a separate patch in separate mail. 
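A minimal illustration of the failure mode Ivan's %error guard is meant to catch, based on his report that the toolchain accepts it silently: under INIT_YMM avx, an integer blend on ymm registers assembles to an AVX2-only encoding without any warning:

    INIT_YMM avx
        ; vpblendvb with ymm registers is an AVX2 instruction, yet nothing
        ; flags that this 'avx'-tagged function now requires AVX2
        vpblendvb  m1, m1, m2, m3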
>>> +%macro EMU_pbroadcastd 2 ; dst, src >>> +%if cpuflag(avx2) >>> + vpbroadcastd %1, %2 >>> +%elif cpuflag(avx) && mmsize >= 32 >>> + %error AVX1 does not support integer 256 bit ymm operations >>> +%else >>> + %ifnum sizeof%2 ; sse2 register >>> + pshufd %1, %2, q0000 >>> + %else ; sse memory >>> + movd %1, %2 ; movd zero extends >>> + pshufd %1, %1, 0 >>> + %endif >>> +%endif >>> +%endmacro >>> >>> Same as above, but using the SPLATD macro. >> >> Again, SPLATD does not handle avx2. >> Also I do think it is better to use a name of existing instruction >> instead of a made up one. > > Same as above. Yes, I agree that the current naming is inconsistent > (that macro was created before broadcast instructions existed) and I'm > OK with renaming it, but if so it should be done in a separate patch. Just renaming it is not enough, because SPLATD works differently (it can broadcast second, third or forth element too) and it is already used in existing code. Maybe PBROADCASTD macro could use SPLATD internally. >>> Use the actual cmpss instruction instead of psuedo-instructions. >>> x86inc doesn't support automatic VEX handling for every possible >>> psuedo-instruction (mainly because there's so many of them) which will >>> result in AVX state transitions if INIT_YMM is used. >> >> Not according to Intel Architecture Code Analyzer >> and the benchmarks I've got. >> The benchmarks showed that using cmpps is about 5% slower on Ryzen >> and absolutely same speed on SkyLake. >> That's in avx2 ymm. > > cmpss, not cmpps. "cmpless a, b" is a psuedo-instruction which the > assembler will convert into the actual instruction "cmpss a, b, 2". "cmpps" is not scalar, but it is handled by x86inc macros and converted to "vcmpps", so no SSE penalties are possible with it. Read again what I wrote previously. > One could of course argue whether or not x86inc should deal with > psuedo-instructions but right now it doesn't so it will cause AVX > state transitions if INIT_YMM is used since it wont be VEX-encoded. Let's say that using a number for selecting comparison mode is NOT developer friendly. >>> This macro is only ever used once, remove and inline it to reduce >>> complexity. >> >> Can I move it to x86util too? >> >> Unified handling of PIC mangling is something welcomed. >> I do think this is the right way. ;) >> I think that at least one other file had something very similar. > > Doing PIC-mangling optimally will in many cases be context-dependent > which is probably why it doesn't already exist. For example if you > need to do multiple complex PIC-loads you should do a single lea and > reuse it which is harder to convey as a generic macro. I think that this macro can help with that too: e.g. SET_PIC_BASE lea, r5, const_1 movaps m1, [pic_base_const_1 + r2 ] movaps m2, [pic_base_const_1 + const_2 - const_1 + r2] I guess you might want a second macro, something like: %define pic_mangle(a,b) (pic_base_%+ a + b - a) movaps m2, [pic_mangle(const_1, const_2) ] >>> IIRC the number of used mmregs is ignored in 32-bit mode so I believe >>> you can unconditionally specify 11. >> >> Is that documented and guaranteed to not change in future? >> I had it at 11 unconditionally. But then decided to play safe. > > There are no callee-saved xmm registers or anything else in x86-32 > that would require keeping track of how many of them are clobbered. OK, will "simplify" it then. >>> Potential use case for 2x FMA instead of 2x mul+add. >> >> FMA is newer than AVX1. 
To actually use the opcodes >> I'd need to have 4'th version just for it or use avx2. >> >> Without possibility of using FMA, the macro usage >> would be just an obfuscation. >> >> Do you insist on it? > > I wouldn't bother with a separate FMA version but if you end up doing > AVX2 it should definitely be done as part of it (AVX2 is a superset of > FMA3). > >>>> +%%restore_sign_loop: >>>> + movaps m0, [tmpY + r4q] ; m0 = Y[i] >>>> + movups m1, [inXq + r4q] ; m1 = X[i] >>>> + andps m1, m2 ; m1 = sign(X[i]) >>>> + orps m0, m1 ; m0 = Y[i]*sign >>>> + cvtps2dq m3, m0 ; m3 = (int)m0 >>>> + movaps [outYq + r4q], m3 >>> >>> The orps instruction could be done using a memory operand instead of >>> movaps+orps. >> >> I think I had tried that, but it was slower on my CPU. > > That's quite weird if it's the case. It's literally the same amount of > µops just with a tiny bit shorter code size, so I would probably chalk > it up to random differences in code alignment or something. Either way > it's really not important so feel free to keep it as it is. I don't leave alignment to the chance ;) Have in mind, uops have different latency so the order still matters. I'll keep it. >>> The reg-reg movaps on x64 could be optimized away by reordering >>> registers so that the return value ends up in m0 instead of m6 from >>> the start. >> >> No, it cannot. >> I need m0 for the blendv mask on sse4. > > Oh right, I'm dumb. Never mind. > >>> The code has a bunch of if conditions for ymm regs but ymm regs are >>> never actually used? >> >> I was asked to remove the INIT_YMM avx2 . >> >> The benchmarks showed that the resulting code is >> slightly slower in the default case, so I had the code disabled. >> But a lot of people cannot stand having disabled code around. :| >> >> If you want to help, you might try to find out why the code is slower. >> My main theory is that 8 floats at once might be just too much, >> 90% of the searches are with N<32. >> >> I don't have avx2 capable cpu to play with. > > Likely due to the cmpless AVX state transition I mentioned earlier. > Could be worth re-testing once that is taken care of. No, it is not. The cmpps version definitely got converted to vcmpps and no SSE penalty could have been imposed on it. The "vcmpps" had absolutely the same speed on Skylake. On Ryzen it was slower. The reason for the slowdown is somewhere else. As for now, should I bring back the disabled "INIT_YMM avx2" or leave it as it is? Best Regards
On Thu, Aug 3, 2017 at 11:36 PM, Ivan Kalvachev <ikalvachev@gmail.com> wrote: >> 1234_____________1234_1234_123 >> VBROADCASTSS ym1, xm1 >> BLENDVPS m1, m2, m3 >> >> is the most commonly used alignment. > > I see that a lot of .asm files use different alignments. > I'll try to pick something similar that I find good looking. > > I also see that different function/macro can have > different position for the first comma. > This could be quite useful, to visually separate > the macros. Things like indentation can be a bit inconsistent at times, yes. Cody by different authors written over the span of many year etc. It's normal to diverge from "standard alignment" in cases when macro names (or memory addresses) are fairly long. Otherwise the amount of whitespace between instructions and operands would easily get too large to be practical. >> Now that I think about it I believe that part wasn't merged because >> smartalign is broken on some older nasm versions (which is another >> reason for avoiding things like this in individual files) and FFmpeg >> doesn't enforce any specific version requirements. I guess it could be >> worked around with some version checking ifdeffery if we know which >> versions are affected. > > I don't see anything relevant in here: > http://repo.or.cz/nasm.git/history/HEAD:/macros/smartalign.mac > I also didn't notice anything similar in the changelog. http://repo.or.cz/nasm.git/commit/f29123b936b1751c6d72dad86b05eb293cca5c6c https://bugzilla.nasm.us/show_bug.cgi?id=3392319 > I had it mixed before, but I changed it to be consistent. > Some new avx instruction are not overloaded or > available without their "v-" prefix. > e.g. x86util uses vpbroadcastw in SPLATW. Every instruction that has both a legacy encoding and a VEX-encoding will be automatically converted to VEX in AVX functions when written without the v-prefix. There is no legacy encoding for newer instructions so there's no reason for x86inc to overload those. > Isn't one of the selling points for x86inc that > it would warn if unsupported instructions are used? Not really, no. It's done in a small subset of cases as a part of the aforementioned auto legacy/VEX conversion because it was trivial to add that functionality, but correctly tracking every single x86 instruction in existence would be a whole different thing. Some instructions requires different instruction sets not just depending on the vector width but the operand types as well (for example, pextrw from mm to gpr requires mmx2, from xmm to gpr requires sse2, from xmm to memory requires sse4.1) and there's no generic way to handle every special corner case which would make things fairly complex. I'm guessing doing so would also cause a significant reduction in compilation speed which I'm sure some people would dislike. >> Add your improvements to the existing x86util macro (can be done in >> the same patch) and then use that. > > FFmpeg guidelines request that it is a separate patch in separate mail. Doesn't really seem to be followed historically by looking at the git history, but doing things by the book is never wrong so making it a separate commit might be the correct choice. > Just renaming it is not enough, because SPLATD > works differently (it can broadcast second, third or forth element too) > and it is already used in existing code. > Maybe PBROADCASTD macro could use SPLATD internally. SPLATD is hardcoded to broadcast the lowest element. 
http://git.videolan.org/?p=ffmpeg.git;a=blob;f=libavutil/x86/x86util.asm;h=cc7d272cada6d73e5ddc42ae359491086a206743;hb=HEAD#l782 Ideally the SSE branch of SPLATD should be also removed and any float code using it should be changed to use the BROADCASTSS macro instead, but that's perhaps going a bit off-topic. > "cmpps" is not scalar, but it is handled by x86inc macros and converted > to "vcmpps", so no SSE penalties are possible with it. > > Read again what I wrote previously. 'cmpps' is packed, not scalar (hence the 'p' instead of 's') so it obviously shouldn't be used in scalar code. I believe you misread my initial comment which said to use 'cmpss' instead of 'cmpless'. >> One could of course argue whether or not x86inc should deal with >> psuedo-instructions but right now it doesn't so it will cause AVX >> state transitions if INIT_YMM is used since it wont be VEX-encoded. > > Let's say that using a number for selecting comparison mode > is NOT developer friendly. I did indeed say one can argue about it. I can see if there's any clean way of handling it that doesn't require a hundred lines of code. >> Doing PIC-mangling optimally will in many cases be context-dependent >> which is probably why it doesn't already exist. For example if you >> need to do multiple complex PIC-loads you should do a single lea and >> reuse it which is harder to convey as a generic macro. > > I think that this macro can help with that too: > e.g. > SET_PIC_BASE lea, r5, const_1 > movaps m1, [pic_base_const_1 + r2 ] > movaps m2, [pic_base_const_1 + const_2 - const_1 + r2] > > I guess you might want a second macro, something like: > %define pic_mangle(a,b) (pic_base_%+ a + b - a) > movaps m2, [pic_mangle(const_1, const_2) ] Hmm, something like this might be doable (completely untested): %macro SET_PIC_BASE 1-2 $$ ; reg, base %ifdef PIC lea %1, [%2] %xdefine WRT_PIC_BASE +%1-%2 %else %xdefine WRT_PIC_BASE %endif %endmacro SET_PIC_BASE r5, const_1 movaps m1, [const_1 + r2 WRT_PIC_BASE] movaps m1, [const_2 + r2 WRT_PIC_BASE] Note that it's generally not possible to create a valid relocation between symbols if one or more of them are external. In that case the relocations needs to be relative to some local reference point, such as $$, which in the above example would be done by simply omitting the base: SET_PIC_BASE r5 It's of course possible to just always use $$, but it wastes a bit of code size by always forcing imm32 offsets so it'd be preferable to leave it optional. > The cmpps version definitely got converted to vcmpps > and no SSE penalty could have been imposed on it. > > The "vcmpps" had absolutely the same speed on Skylake. > On Ryzen it was slower. > > The reason for the slowdown is somewhere else. The code you posted used cmpless, not cmpps. Pseudo-instruction do _NOT_ automatically become VEX-encoded in AVX mode, feel free to look at the disassembly. > As for now, > should I bring back the disabled "INIT_YMM avx2" > or leave it as it is? If you tested it with no AVX state transitions and it was still slower than 128-bit SIMD on a CPU with 256-bit SIMD units than it's of course better to just stick with the existing 128-bit code.
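As a concrete illustration of the pextrw example Henrik gives above (general ISA background, not code from the patch; the address register is arbitrary):

        pextrw  eax, mm1, 0            ; MMX source to a GPR    : needs MMX2
        pextrw  eax, xmm1, 0           ; XMM source to a GPR    : needs SSE2
        pextrw  word [rax], xmm1, 0    ; XMM source to memory   : needs SSE4.1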
On 8/4/17, Henrik Gramner <henrik@gramner.com> wrote: > On Thu, Aug 3, 2017 at 11:36 PM, Ivan Kalvachev <ikalvachev@gmail.com> > wrote: >>> 1234_____________1234_1234_123 >>> VBROADCASTSS ym1, xm1 >>> BLENDVPS m1, m2, m3 >>> >>> is the most commonly used alignment. >> >> I see that a lot of .asm files use different alignments. >> I'll try to pick something similar that I find good looking. >> >> I also see that different function/macro can have >> different position for the first comma. >> This could be quite useful, to visually separate >> the macros. > > Things like indentation can be a bit inconsistent at times, yes. Cody > by different authors written over the span of many year etc. > > It's normal to diverge from "standard alignment" in cases when macro > names (or memory addresses) are fairly long. Otherwise the amount of > whitespace between instructions and operands would easily get too > large to be practical. I tried different things. The one space after comma indeed looks best. I used 2 spaces after instruction. >>> Now that I think about it I believe that part wasn't merged because >>> smartalign is broken on some older nasm versions (which is another >>> reason for avoiding things like this in individual files) and FFmpeg >>> doesn't enforce any specific version requirements. I guess it could be >>> worked around with some version checking ifdeffery if we know which >>> versions are affected. >> >> I don't see anything relevant in here: >> http://repo.or.cz/nasm.git/history/HEAD:/macros/smartalign.mac >> I also didn't notice anything similar in the changelog. > http://repo.or.cz/nasm.git/commit/f29123b936b1751c6d72dad86b05eb293cca5c6c > https://bugzilla.nasm.us/show_bug.cgi?id=3392319 This would break generation of dependencies? I think that FFmpeg uses combined mode with nasm, so it should not call preprocessor only mode, unless DBG=1 is used. I'll investigate this further. (revert the nasm fix, try building ffmpeg dependencies ...) >> I had it mixed before, but I changed it to be consistent. >> Some new avx instruction are not overloaded or >> available without their "v-" prefix. >> e.g. x86util uses vpbroadcastw in SPLATW. > > Every instruction that has both a legacy encoding and a VEX-encoding > will be automatically converted to VEX in AVX functions when written > without the v-prefix. There is no legacy encoding for newer > instructions so there's no reason for x86inc to overload those. That's great, but it doesn't address the original problem. Do you insist on removing the "v-" prefix from AVX only instructions? You said that this is purely cosmetic and I think that it would make compilation a bit slower, because of unnecessary macro expansions. ;) >> Isn't one of the selling points for x86inc that >> it would warn if unsupported instructions are used? > > Not really, no. It's done in a small subset of cases as a part of the > aforementioned auto legacy/VEX conversion because it was trivial to > add that functionality, but correctly tracking every single x86 > instruction in existence would be a whole different thing. > > Some instructions requires different instruction sets not just > depending on the vector width but the operand types as well (for > example, pextrw from mm to gpr requires mmx2, from xmm to gpr requires > sse2, from xmm to memory requires sse4.1) and there's no generic way > to handle every special corner case which would make things fairly > complex. 
I'm guessing doing so would also cause a significant > reduction in compilation speed which I'm sure some people would > dislike. There is already check for register size and integer operations, because SSE1 does not have 128bit xmm integer operations. It should be just few lines of code to add the same check for AVX1 and 256bit ymm integer operations. (I'm not confident enough to mess with that code, yet.) As for the immediate task: The "error" message stays. Maybe I should also add "use PBROADCASTD instead" to it? >>> Add your improvements to the existing x86util macro (can be done in >>> the same patch) and then use that. >> >> FFmpeg guidelines request that it is a separate patch in separate mail. > > Doesn't really seem to be followed historically by looking at the git > history, but doing things by the book is never wrong so making it a > separate commit might be the correct choice. Will do. >> Just renaming it is not enough, because SPLATD >> works differently (it can broadcast second, third or forth element too) >> and it is already used in existing code. >> Maybe PBROADCASTD macro could use SPLATD internally. > > SPLATD is hardcoded to broadcast the lowest element. > > http://git.videolan.org/?p=ffmpeg.git;a=blob;f=libavutil/x86/x86util.asm;h=cc7d272cada6d73e5ddc42ae359491086a206743;hb=HEAD#l782 > > Ideally the SSE branch of SPLATD should be also removed and any float > code using it should be changed to use the BROADCASTSS macro instead, > but that's perhaps going a bit off-topic. It seems I've been looking at x264 x86util.asm: https://git.videolan.org/?p=x264.git;a=blob;f=common/x86/x86util.asm;h=7a140ebd93ccd845b61bfbe1e7e379b09d41d293;hb=HEAD#l280 I won't be using SPLATD in PBROADCASTD. >> "cmpps" is not scalar, but it is handled by x86inc macros and converted >> to "vcmpps", so no SSE penalties are possible with it. >> >> Read again what I wrote previously. > > 'cmpps' is packed, not scalar (hence the 'p' instead of 's') so it > obviously shouldn't be used in scalar code. I believe you misread my > initial comment which said to use 'cmpss' instead of 'cmpless'. cmpss/ps seem to have same latency/throughput on AVX capable CPUs. I'm using cmpss simply because I need to compare only the lowest element. It's the only scalar instruction in that block. Anyway, already changed the cmpless to cmpss. Maybe I'll be able to ask for another benchmark on Ryzen. >>> One could of course argue whether or not x86inc should deal with >>> psuedo-instructions but right now it doesn't so it will cause AVX >>> state transitions if INIT_YMM is used since it wont be VEX-encoded. >> >> Let's say that using a number for selecting comparison mode >> is NOT developer friendly. > > I did indeed say one can argue about it. I can see if there's any > clean way of handling it that doesn't require a hundred lines of code. Hundred?? cmpless is already handled by the compiler. I think all you'd need to handle it is: AVX_INSTR cmpless, sse, 1, 0, 0 and the same for cmpeqss, cmpeqps, cmpltss, cmpltps, cmpleps, etc... 8 packed, 8 scalar. Unless I miss something (and as I've said before, I'm not confident enough to mess with that code.) (AVX does extend to 32 variants, but they are not SSE compatible, so no need to emulate them.) >>> Doing PIC-mangling optimally will in many cases be context-dependent >>> which is probably why it doesn't already exist. For example if you >>> need to do multiple complex PIC-loads you should do a single lea and >>> reuse it which is harder to convey as a generic macro. 
>> >> I think that this macro can help with that too: >> e.g. >> SET_PIC_BASE lea, r5, const_1 >> movaps m1, [pic_base_const_1 + r2 ] >> movaps m2, [pic_base_const_1 + const_2 - const_1 + r2] >> >> I guess you might want a second macro, something like: >> %define pic_mangle(a,b) (pic_base_%+ a + b - a) >> movaps m2, [pic_mangle(const_1, const_2) ] > > Hmm, something like this might be doable (completely untested): > > %macro SET_PIC_BASE 1-2 $$ ; reg, base > %ifdef PIC > lea %1, [%2] > %xdefine WRT_PIC_BASE +%1-%2 > %else > %xdefine WRT_PIC_BASE > %endif > %endmacro > > SET_PIC_BASE r5, const_1 > movaps m1, [const_1 + r2 WRT_PIC_BASE] > movaps m1, [const_2 + r2 WRT_PIC_BASE] movaps m1, [WRT_PIC_BASE + const_2 + r2 ] Looks better. (Also not tested. Will do, later.) > Note that it's generally not possible to create a valid relocation > between symbols if one or more of them are external. In that case the > relocations needs to be relative to some local reference point, such > as $$, which in the above example would be done by simply omitting the > base: > > SET_PIC_BASE r5 > > It's of course possible to just always use $$, but it wastes a bit of > code size by always forcing imm32 offsets so it'd be preferable to > leave it optional. Yeh $$ is the start of the current section, and that's is going to be ".text" not "rodata". Anyway, for now I'll keep my PIC macro code in my file. OK? >> The cmpps version definitely got converted to vcmpps >> and no SSE penalty could have been imposed on it. >> >> The "vcmpps" had absolutely the same speed on Skylake. >> On Ryzen it was slower. >> >> The reason for the slowdown is somewhere else. > > The code you posted used cmpless, not cmpps. Pseudo-instruction do > _NOT_ automatically become VEX-encoded in AVX mode, feel free to look > at the disassembly. And I've been saying that I asked people to do benchmarks with "cmpless" changed to "cmpps", just to rule out SSE vs AVX. >> As for now, >> should I bring back the disabled "INIT_YMM avx2" >> or leave it as it is? > > If you tested it with no AVX state transitions and it was still slower YES! > than 128-bit SIMD on a CPU with 256-bit SIMD units than it's of course > better to just stick with the existing 128-bit code. This comes close, but does not exactly answer my question. Should I restore the disabled code (as disabled code), or should I just keep it deleted, so other people would also wonder "What's going on here?!". ;) Disabled or Deleted. Best Regards
On 4 August 2017 at 23:58, Ivan Kalvachev <ikalvachev@gmail.com> wrote: > > >> I had it mixed before, but I changed it to be consistent. > >> Some new avx instruction are not overloaded or > >> available without their "v-" prefix. > >> e.g. x86util uses vpbroadcastw in SPLATW. > > > > Every instruction that has both a legacy encoding and a VEX-encoding > > will be automatically converted to VEX in AVX functions when written > > without the v-prefix. There is no legacy encoding for newer > > instructions so there's no reason for x86inc to overload those. > > That's great, but it doesn't address the original problem. > Do you insist on removing the "v-" prefix from AVX only instructions? > > I insist.
On 8/5/17, Rostislav Pehlivanov <atomnuker@gmail.com> wrote: > On 4 August 2017 at 23:58, Ivan Kalvachev <ikalvachev@gmail.com> wrote: > >> >> >> I had it mixed before, but I changed it to be consistent. >> >> Some new avx instruction are not overloaded or >> >> available without their "v-" prefix. >> >> e.g. x86util uses vpbroadcastw in SPLATW. >> > >> > Every instruction that has both a legacy encoding and a VEX-encoding >> > will be automatically converted to VEX in AVX functions when written >> > without the v-prefix. There is no legacy encoding for newer >> > instructions so there's no reason for x86inc to overload those. >> >> That's great, but it doesn't address the original problem. >> Do you insist on removing the "v-" prefix from AVX only instructions? >> >> > I insist. If you insist, then you should have a good technical reason for it. Something that is not based on looks, cosmetics or "it has always been like this". For example, slow compilation because of unnecessary macro expansion is a technical reason to keep the "v-" prefix. ;)
On Sat, Aug 5, 2017 at 12:21 PM, Ivan Kalvachev <ikalvachev@gmail.com> wrote: > On 8/5/17, Rostislav Pehlivanov <atomnuker@gmail.com> wrote: >> On 4 August 2017 at 23:58, Ivan Kalvachev <ikalvachev@gmail.com> wrote: >> >>> >>> >> I had it mixed before, but I changed it to be consistent. >>> >> Some new avx instruction are not overloaded or >>> >> available without their "v-" prefix. >>> >> e.g. x86util uses vpbroadcastw in SPLATW. >>> > >>> > Every instruction that has both a legacy encoding and a VEX-encoding >>> > will be automatically converted to VEX in AVX functions when written >>> > without the v-prefix. There is no legacy encoding for newer >>> > instructions so there's no reason for x86inc to overload those. >>> >>> That's great, but it doesn't address the original problem. >>> Do you insist on removing the "v-" prefix from AVX only instructions? >>> >>> >> I insist. > > If you insist, then you should have a good technical reason for it. > Something that is not based on looks, cosmetics or "it has always been > like this". > > For example, slow compilation because of unnecessary macro expansion is > a technical reason to keep the "v-" prefix. > Consistency with the typical style of our codebase is also a reason. - Hendrik
On 8/5/17, Hendrik Leppkes <h.leppkes@gmail.com> wrote:
> On Sat, Aug 5, 2017 at 12:21 PM, Ivan Kalvachev <ikalvachev@gmail.com>
> wrote:
>> On 8/5/17, Rostislav Pehlivanov <atomnuker@gmail.com> wrote:
>>> On 4 August 2017 at 23:58, Ivan Kalvachev <ikalvachev@gmail.com> wrote:
>>>
>>>>
>>>> >> I had it mixed before, but I changed it to be consistent.
>>>> >> Some new avx instruction are not overloaded or
>>>> >> available without their "v-" prefix.
>>>> >> e.g. x86util uses vpbroadcastw in SPLATW.
>>>> >
>>>> > Every instruction that has both a legacy encoding and a VEX-encoding
>>>> > will be automatically converted to VEX in AVX functions when written
>>>> > without the v-prefix. There is no legacy encoding for newer
>>>> > instructions so there's no reason for x86inc to overload those.
>>>>
>>>> That's great, but it doesn't address the original problem.
>>>> Do you insist on removing the "v-" prefix from AVX only instructions?
>>>>
>>>>
>>> I insist.
>>
>> If you insist, then you should have a good technical reason for it.
>> Something that is not based on looks, cosmetics or "it has always been
>> like this".
>>
>> For example, slow compilation because of unnecessary macro expansion is
>> a technical reason to keep the "v-" prefix.
>>
>
> Consistency with the typical style of our codebase is also a reason.

No, it is not.
This falls into the category of doing things wrong
because we have always done them wrong.

Also, you brought up the matter of compilation speed;
now you don't care about it? ;)

Don't worry, I will do the requested change,
only because in the future it might help with the AVX1 & ymm integer test.

Please answer my other question(s).
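For reference, a minimal sketch of the auto-promotion being discussed (assumes x86inc.asm is included; the function name is made up purely for illustration):

    ; Inside an INIT_XMM avx function, x86inc overloads the unprefixed mnemonic,
    ; so both lines below assemble to the same VEX-encoded vaddps.  Only
    ; AVX-only instructions, which have no legacy spelling, keep a mandatory
    ; "v-" prefix.
    INIT_XMM avx
    cglobal vex_promotion_demo, 0, 0, 2
        addps  m0, m1        ; emitted as vaddps xmm0, xmm0, xmm1
        vaddps m0, m0, m1    ; explicit form, identical encoding
        RET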
On Sat, Aug 5, 2017 at 12:58 AM, Ivan Kalvachev <ikalvachev@gmail.com> wrote:
> 8 packed, 8 scalar.
>
> Unless I miss something (and as I've said before,
> I'm not confident enough to mess with that code.)
>
> (AVX does extend to 32 variants, but they are not
> SSE compatible, so no need to emulate them.)

Oh, right. I quickly glanced at the docs and saw 32 pseudo-ops for each
instruction for a total of 128 when adding pd, ps, sd, ss, but the fact
that only the first 8 are relevant here reduces it to 32, which is a lot
more manageable.

> movaps m1, [WRT_PIC_BASE + const_2 + r2 ]
>
> Looks better. (Also not tested. Will do, later.)

I intentionally used the WRT define at the end because that's most
similar to the built-in wrt syntax used when accessing symbols through
the PLT or GOT, e.g. mov eax, [external_symbol wrt ..got]

> Yeah, $$ is the start of the current section, and that is going to be
> ".text", not "rodata".

Obviously, yes. You need a reference that results in a compile-time
constant PC-offset (which .rodata isn't) to create PC-relative
relocation records to external symbols.
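To illustrate the single-lea idea in concrete terms, a small sketch (register and constant names follow the examples quoted above and are placeholders; assumes default rel is in effect, as x86inc enables for PIC builds on x86-64):

    lea    r5, [const_1]                     ; one RIP-relative lea, reused below
    movaps m1, [r5 + r2]                     ; loads from const_1 + r2
    movaps m2, [r5 + const_2 - const_1 + r2] ; const_2 - const_1 folds to an immediate,
                                             ; which only works when both labels are local;
                                             ; an external symbol needs a $$ base or the GOT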
From 67c93689fab23386cb4d08d4a74d3e804f07c4f4 Mon Sep 17 00:00:00 2001 From: Ivan Kalvachev <ikalvachev@gmail.com> Date: Thu, 8 Jun 2017 22:24:33 +0300 Subject: [PATCH 1/5] SIMD opus pvq_search implementation Explanation on the workings and methods used by the Pyramid Vector Quantization Search function could be found in the following Work-In-Progress mail threads: http://ffmpeg.org/pipermail/ffmpeg-devel/2017-June/212146.html http://ffmpeg.org/pipermail/ffmpeg-devel/2017-June/212816.html http://ffmpeg.org/pipermail/ffmpeg-devel/2017-July/213030.html http://ffmpeg.org/pipermail/ffmpeg-devel/2017-July/213436.html Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com> --- libavcodec/opus_pvq.c | 3 + libavcodec/opus_pvq.h | 5 +- libavcodec/x86/Makefile | 2 + libavcodec/x86/opus_dsp_init.c | 43 +++ libavcodec/x86/opus_pvq_search.asm | 538 +++++++++++++++++++++++++++++++++++++ 5 files changed, 589 insertions(+), 2 deletions(-) create mode 100644 libavcodec/x86/opus_dsp_init.c create mode 100644 libavcodec/x86/opus_pvq_search.asm diff --git a/libavcodec/opus_pvq.c b/libavcodec/opus_pvq.c index 2ac66a0ede..3aa502929c 100644 --- a/libavcodec/opus_pvq.c +++ b/libavcodec/opus_pvq.c @@ -947,6 +947,9 @@ int av_cold ff_celt_pvq_init(CeltPVQ **pvq) s->encode_band = pvq_encode_band; s->band_cost = pvq_band_cost; + if (ARCH_X86 && CONFIG_OPUS_ENCODER) + ff_opus_dsp_init_x86(s); + *pvq = s; return 0; diff --git a/libavcodec/opus_pvq.h b/libavcodec/opus_pvq.h index 6691494838..9246337360 100644 --- a/libavcodec/opus_pvq.h +++ b/libavcodec/opus_pvq.h @@ -33,8 +33,8 @@ float *lowband_scratch, int fill) struct CeltPVQ { - DECLARE_ALIGNED(32, int, qcoeff )[176]; - DECLARE_ALIGNED(32, float, hadamard_tmp)[176]; + DECLARE_ALIGNED(32, int, qcoeff )[256]; + DECLARE_ALIGNED(32, float, hadamard_tmp)[256]; float (*pvq_search)(float *X, int *y, int K, int N); @@ -45,6 +45,7 @@ struct CeltPVQ { }; int ff_celt_pvq_init (struct CeltPVQ **pvq); +void ff_opus_dsp_init_x86(struct CeltPVQ *s); void ff_celt_pvq_uninit(struct CeltPVQ **pvq); #endif /* AVCODEC_OPUS_PVQ_H */ diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 0dbc46504e..9875f48797 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -52,6 +52,7 @@ OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o +OBJS-$(CONFIG_OPUS_ENCODER) += x86/opus_dsp_init.o OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o @@ -123,6 +124,7 @@ X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o +X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/opus_pvq_search.o X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \ x86/fpel.o \ diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c new file mode 100644 index 0000000000..f4c25822db --- /dev/null +++ b/libavcodec/x86/opus_dsp_init.c @@ -0,0 +1,43 @@ +/* + * Opus encoder assembly optimizations + * Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/x86/cpu.h" +#include "libavcodec/opus_pvq.h" + +extern float ff_pvq_search_sse2(float *X, int *y, int K, int N); +extern float ff_pvq_search_sse4(float *X, int *y, int K, int N); +extern float ff_pvq_search_avx (float *X, int *y, int K, int N); + +av_cold void ff_opus_dsp_init_x86(CeltPVQ *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) + s->pvq_search = ff_pvq_search_sse2; + + if (EXTERNAL_SSE4(cpu_flags)) + s->pvq_search = ff_pvq_search_sse4; + + if (EXTERNAL_AVX(cpu_flags)) + s->pvq_search = ff_pvq_search_avx; +} diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm new file mode 100644 index 0000000000..359ad7a8a4 --- /dev/null +++ b/libavcodec/x86/opus_pvq_search.asm @@ -0,0 +1,538 @@ +;****************************************************************************** +;* SIMD optimized Opus encoder DSP function +;* +;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "config.asm" +%include "libavutil/x86/x86util.asm" + +%ifdef __NASM_VER__ +%use "smartalign" +ALIGNMODE p6 +%endif + +; Use float op that give half precision but execute for around 3 cycles. +; On Skylake & Ryzen the division is much faster (around 11c/3), +; that makes the full precision code about 2% slower. +; Opus also does use rsqrt approximation in their intrinsics code. +%define USE_APPROXIMATION 01 + +; Presearch tries to quantize by using the property Sum( abs(x[i]*K)/Sx ) = K. +; If we use truncation for the division result then the integer Sum would be <= K. +; By using nearest rounding we get closer approximation, but +; we could also get more than K pulses and we have to remove the extra ones. +; This method is the main improvement of the ffmpeg C function over the Opus original. 
+%define PRESEARCH_ROUNDING 01 + + + +SECTION_RODATA 64 + +const_float_abs_mask: times 8 dd 0x7fffffff +const_align_abs_edge: times 8 dd 0 + +const_float_0_5: times 8 dd 0.5 +const_float_1: times 8 dd 1.0 +const_float_sign_mask: times 8 dd 0x80000000 + +const_int32_offsets: + %rep 8 + dd $-const_int32_offsets + %endrep +SECTION .text + + + + +; +; Horizontal Sum Packed Single precision float +; The resulting sum is in all elements. +; +%macro HSUMPS 2 ; dst/src, tmp +%if cpuflag(avx) + %if sizeof%1>=32 ; avx + vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1) + vaddps %1, %2 + %endif + vshufps %2, %1, %1, q1032 + vaddps %1, %2 + vshufps %2, %1, %1, q0321 + vaddps %1, %2 + +%else ; this form is a bit faster than the short avx-like emulation. + movaps %2, %1 ; [d, c, b, a ] + shufps %1, %1, q1032 ; %2=[b, a, d, c ] + addps %1, %2 ; %1=[b+d, a+c, d+b, c+a ] + movaps %2, %1 + shufps %1, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ] + addps %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ] + ; all %1 members should be equal for as long as float a+b==b+a +%endif +%endmacro + +; +; Emulate blendvps if not available +; +; src_b destroyed when using emulation with logical operands +; SSE41 blend instruction is hard coded to use xmm0 as mask +; +%macro EMU_blendvps 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + vblendvps %1, %1, %2, %3 +;------------------------- +%elif cpuflag(sse4) + %if notcpuflag(avx) + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3d operand, you used %3 + %endif + %endif + blendvps %1, %2, %3 +;------------------------- +%else + xorps %2, %1 + andps %2, %3 + xorps %1, %2 +%endif +%endmacro + +; +; Emulate pblendvb if not available +; +; src_b destroyed when using emulation with logical operands +; SSE41 blend instruction is hard coded to use xmm0 as mask +; +%macro EMU_pblendvb 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32 + %error AVX1 does not support integer 256 bit ymm operations + %endif + + vpblendvb %1, %1, %2, %3 +;------------------------- +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3 operand, you used %3 + %endif + pblendvb %1, %2, %3 +;------------------------- +%else + pxor %2, %1 + pand %2, %3 + pxor %1, %2 +%endif +%endmacro + +; +; Emulate broadcastss if not available +; +%macro EMU_broadcastss 2 ; dst, src +%if cpuflag(avx2) + vbroadcastss %1, %2 ; ymm, xmm +;------------------------- +%elif cpuflag(avx) + %ifnum sizeof%2 ; avx1 register + vpermilps xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm + %if sizeof%1 >= 32 ; mmsize>=32 + vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm, im + %endif + %else ; avx1 memory + vbroadcastss %1, %2 ; ymm, mm32 || xmm, mm32 + %endif +;------------------------- +%else + %ifnum sizeof%2 ; sse register + shufps %1, %2, %2, q0000 + %else ; sse memory + movss %1, %2 ; this zeroes the other 3 elements + shufps %1, %1, 0 + %endif +%endif +%endmacro + +; +; Emulate pbroadcastd if not available +; +%macro EMU_pbroadcastd 2 ; dst, src +%if cpuflag(avx2) + vpbroadcastd %1, %2 +%elif cpuflag(avx) && mmsize >= 32 + %error AVX1 does not support integer 256 bit ymm operations +%else + %ifnum sizeof%2 ; sse2 register + pshufd %1, %2, q0000 + %else ; sse memory + movd %1, %2 ; movd zero extends + pshufd %1, %1, 0 + %endif +%endif +%endmacro + +; +; Setup High Register to be used +; for holding memory constants +; +; %1 - the register to be used, assmues it is >= mm8 +; %2 - name of the constant. 
+; +; Subsequent opcodes are going to use the constant in the form +; "addps m0, mm_const_name" and it would be turned into: +; "addps m0, [const_name]" on 32 bit arch or +; "addps m0, m8" on 64 bit arch +%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name +%if num_mmregs > 8 + %define mm_%3 %2 + %{1} %2, [%3] ; movaps m8, [const_name] +%else + %define mm_%3 [%3] +%endif +%endmacro + +; +; Set +; Position Independent Code +; Base address of a constant +; %1 - the register to be used, if PIC is set +; %2 - name of the constant. +; +; Subsequent opcode are going to use the base address in the form +; "movaps m0, [pic_base_constant_name+r4q]" and it would be turned into +; "movaps m0, [r5q + r4q]" if PIC is enabled +; "movaps m0, [constant_name + r4q]" if texrel are used +%macro SET_PIC_BASE 3; reg, const_label +%ifdef PIC + %{1} %2, [%3] ; lea r5q, [rip+const] + %define pic_base_%3 %2 +%else + %define pic_base_%3 %3 +%endif +%endmacro + + + + +%macro PULSES_SEARCH 1 +; m6 Syy_norm +; m7 Sxy_norm + addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2 + pxor m1, m1 ; max_idx + xorps m3, m3 ; p_max + xor r4q, r4q +align 16 +%%distortion_search: + movd xmm2, dword r4d ; movd zero extends +%ifidn %1,add + movaps m4, [tmpY + r4q] ; y[i] + movaps m5, [tmpX + r4q] ; X[i] + + %if USE_APPROXIMATION == 1 + xorps m0, m0 + cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) + %endif + + addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm + addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm + + %if USE_APPROXIMATION == 1 + andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding. + %endif + +%else + movaps m5, [tmpY + r4q] ; m5 = y[i] + + xorps m0, m0 ; m0 = 0; + cmpps m0, m0, m5, 1 ; m0 = (0<y) + + subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i] + subps m5, m7, [tmpX + r4q] ; m5 = Sxy_new = Sxy_norm - X[i] + andps m5, m0 ; (0<y)?m5:0 +%endif + +%if USE_APPROXIMATION == 1 + rsqrtps m4, m4 + mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) ) +%else + mulps m5, m5 + divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy +%endif + EMU_pbroadcastd m2, xmm2 ; m2=i (all lanes get same values, we add the offset-per-lane, later) + + cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max) + maxps m3, m5 ; m3=max(p_max,p) + ; maxps here is faster than blendvps, despite blend having lower latency. + + pand m2, m0 ; This version seems faster than sse41 pblendvb + pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4 + + add r4q, mmsize + cmp r4q, Nq + jb %%distortion_search + + por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop) + movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing + +%if mmsize >= 32 +; Merge parallel maximums round 8 (4 vs 4) + + vextractf128 xmm5, ymm3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b] + vcmpps xmm0, xmm3, xmm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] ) + + vextracti128 xmm2, ymm1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b] + EMU_blendvps xmm3, xmm5, xmm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128] + EMU_pblendvb xmm1, xmm2, xmm0 ; p = m0 ? p[1x128] : p[0x128] +%endif + +; Merge parallel maximums round 4 (2 vs 2) + ; m3=p[3210] + movhlps xmm5, xmm3 ; m5=p[xx32] + cmpps xmm0, xmm3, xmm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] ) + + pshufd xmm2, xmm1, q3232 + EMU_blendvps xmm3, xmm5, xmm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0] + EMU_pblendvb xmm1, xmm2, xmm0 ; p = m0 ? 
p[3,2] : p[1,0] + +; Merge parallel maximums final round (1 vs 1) + movaps xmm0, xmm3 ; m0 = m3[0] = p[0] + shufps xmm3, xmm3, q1111 ; m3 = m3[1] = p[1] + cmpless xmm0, xmm3 + + pshufd xmm2, xmm1, q1111 + EMU_pblendvb xmm1, xmm2, xmm0 + + movd dword r4d, xmm1 ; zero extends to the rest of r4q + + EMU_broadcastss m3, [tmpX + r4q] + %{1}ps m7, m3 ; Sxy += X[max_idx] + + EMU_broadcastss m5, [tmpY + r4q] + %{1}ps m6, m5 ; Syy += Y[max_idx] + + ; We have to update a single element in Y[i] + ; However writing 4 bytes and then doing 16 byte load in the inner loop + ; could cause a stall due to breaking write forwarding. + EMU_pbroadcastd m1, xmm1 + pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it + + and r4q, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load + movaps m5, [tmpY + r4q] ; m5 = Y[y3...ym...y0] + andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0] + %{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0] + movaps [tmpY + r4q], m5 ; Y[max_idx] +-= 1.0; + +%endmacro + +; +; We need one more register for +; PIC relative addressing. Use this +; to count it in cglobal +; +%ifdef PIC + %define num_pic_regs 1 +%else + %define num_pic_regs 0 +%endif + +; +; In x86_64 mode few more xmm registers are used to hold constants. +; Use this to count them in cglobal +; +%if num_mmregs > 8 +%define num_hireg_used 3 +%endif + +; +; Pyramid Vector Quantization Search implementation +; +; float * inX - Unaligned (SIMD) access, it will be overread, +; but extra data is masked away. +; int32 * outY - Should be aligned and padded buffer. +; It is used as temp buffer. +; uint32 K - Number of pulses to have after quantizations. +; uint32 N - Number of vector elements. Must be 0 < N < 256 +; +%macro PVQ_FAST_SEARCH 0 +cglobal pvq_search, 4, 5+num_pic_regs, 8+num_hireg_used, 256*4, inX, outY, K, N +%define tmpX rsp +%define tmpY outYq + + + movaps m0, [const_float_abs_mask] + shl Nd, 2 ; N *= sizeof(float) ; also 32 bit operation zeroes the high 32 bits in 64 bit mode. + mov r4q, Nq + + neg r4q + and r4q, mmsize-1 + + SET_PIC_BASE lea, r5q, const_align_abs_edge ; rip+const + movups m3, [pic_base_const_align_abs_edge + r4q - mmsize] + + add Nq, r4q ; Nq = align(Nq, mmsize) + + lea r4q, [Nq - mmsize] ; Nq is rounded up (aligned up) to mmsize, so r4q can't become negative here, unless N=0. 
+ movups m2, [inXq + r4q] + andps m2, m3 + movaps [tmpX + r4q], m2 + movaps m1, m2 ; Sx + +align 16 +%%loop_abs_sum: + sub r4q, mmsize + jc %%end_loop_abs_sum + + movups m2, [inXq + r4q] + andps m2, m0 + + movaps [tmpX + r4q], m2 ; tmpX[i]=abs(X[i]) + addps m1, m2 + jmp %%loop_abs_sum + +align 16 +%%end_loop_abs_sum: + + HSUMPS m1, m2 ; m1 = Sx + + xorps m0, m0 + comiss xmm0, xmm1 ; + jz %%zero_input ; if (Sx==0) goto zero_input + + cvtsi2ss xmm0, dword Kd ; m0 = K +%if USE_APPROXIMATION == 1 + rcpss xmm1, xmm1 ; m1 = approx(1/Sx) + mulss xmm0, xmm1 ; m0 = K*(1/Sx) +%else + divss xmm0, xmm1 ; b = K/Sx + ; b = K/max_x +%endif + + EMU_broadcastss m0, xmm0 + + lea r4q, [Nq - mmsize] + pxor m5, m5 ; Sy ( Sum of abs( y[i]) ) + xorps m6, m6 ; Syy ( Sum of y[i]*y[i] ) + xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] ) +align 16 +%%loop_guess: + movaps m1, [tmpX + r4q] ; m1 = X[i] + mulps m2, m0, m1 ; m2 = res*X[i] + %if PRESEARCH_ROUNDING == 0 + cvttps2dq m2, m2 ; yt = (int)truncf( res*X[i] ) + %else + cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] ) + %endif + paddd m5, m2 ; Sy += yt + cvtdq2ps m2, m2 ; yt = (float)yt + mulps m1, m2 ; m1 = X[i]*yt + movaps [tmpY + r4q], m2 ; y[i] = m2 + addps m7, m1 ; Sxy += m1; + mulps m2, m2 ; m2 = yt*yt + addps m6, m2 ; Syy += m2 + + sub r4q, mmsize + jnc %%loop_guess + + HSUMPS m6, m1 ; Syy_norm + HADDD m5, m4 ; pulses + + movd dword r4d, xmm5 ; zero extends to the rest of r4q + + sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode. + jz %%finish ; K - pulses == 0 + + SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5 + SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1 + SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets + ; Use Syy/2 in distortion parameter calculations. + ; Saves pre and post-caclulation to correct Y[] values. + ; Same precision, since float mantisa is normalized. + ; The SQRT approximation does differ. + HSUMPS m7, m0 ; Sxy_norm + mulps m6, mm_const_float_0_5 + + jc %%remove_pulses_loop ; K - pulses < 0 + +align 16 ; K - pulses > 0 +%%add_pulses_loop: + + PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm + + sub Kd, 1 + jnz %%add_pulses_loop + + addps m6, m6 ; Syy*=2 + + jmp %%finish + +align 16 +%%remove_pulses_loop: + + PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm + + add Kd, 1 + jnz %%remove_pulses_loop + + addps m6, m6 ; Syy*=2 + +align 16 +%%finish: + lea r4q, [Nq - mmsize] + + movaps m2, [const_float_sign_mask] +align 16 +%%restore_sign_loop: + movaps m0, [tmpY + r4q] ; m0 = Y[i] + movups m1, [inXq + r4q] ; m1 = X[i] + andps m1, m2 ; m1 = sign(X[i]) + orps m0, m1 ; m0 = Y[i]*sign + cvtps2dq m3, m0 ; m3 = (int)m0 + movaps [outYq + r4q], m3 + + sub r4q, mmsize + jnc %%restore_sign_loop +%%return: + +%if ARCH_X86_64 == 0 ;sbrdsp + movss r0m, xmm6 ; return (float)Syy_norm + fld dword r0m +%else + movaps m0, m6 ; return (float)Syy_norm +%endif + RET + +align 16 +%%zero_input: + lea r4q, [Nq - mmsize] + xorps m0, m0 +%%zero_loop: + movaps [outYq + r4q], m0 + sub r4q, mmsize + jnc %%zero_loop + + movaps m6, [const_float_1] + jmp %%return +%endmacro + + +INIT_XMM sse2 +PVQ_FAST_SEARCH + +INIT_XMM sse4 +PVQ_FAST_SEARCH + +INIT_XMM avx +PVQ_FAST_SEARCH -- 2.13.2
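As a side note on the disabled 256-bit path discussed earlier: if it were ever re-enabled, the instantiation would presumably follow the same pattern as the 128-bit variants at the end of the patch (shown only as a hypothetical sketch; it was left out because it benchmarked slower than the 128-bit code):

    INIT_YMM avx2
    PVQ_FAST_SEARCH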