Message ID | 20171126225111.5108-7-james.darnley@gmail.com |
---|---|
State | New |
On 26 November 2017 at 22:51, James Darnley <james.darnley@gmail.com> wrote:
> Around 1.1 times faster and reduces runtime by up to 6%.
> ---
>  libavcodec/x86/flac_dsp_gpl.asm | 91 ++++++++++++++++++++++++++++++++---------
>  1 file changed, 72 insertions(+), 19 deletions(-)
>
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> index 952fc8b86b..91989ce560 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -152,13 +152,13 @@ RET
>  %macro FUNCTION_BODY_32 0
>
>  %if ARCH_X86_64
> -    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
> +    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize*4, res, smp, len, order, coefs
>

Why x4, shouldn't this be x2?

> [...]

Apart from that lgtm
On 2017-11-27 00:17, Rostislav Pehlivanov wrote:
> On 26 November 2017 at 22:51, James Darnley <james.darnley@gmail.com> wrote:
>> @@ -152,13 +152,13 @@ RET
>>  %macro FUNCTION_BODY_32 0
>>
>>  %if ARCH_X86_64
>> -    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
>> +    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize*4, res, smp, len, order, coefs
>>
>
> Why x4, shouldn't this be x2?

I write 3 more mm registers to the stack.  The first slot is the
sign-extension mask for my hacked qword arithmetic shift added in the
first 32-bit patch; the 3 new slots store the "odd" values created in the
first inner loop.

I admit this is a rather ugly construction for a little speed gain, but I
think I've seen other ugly things since writing this.
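To make that stack layout concrete, here is a rough scalar C sketch of the
two-pass scheme (an illustration only, not FFmpeg's C reference
implementation: the function name, the fixed 12-sample block, and the
odd_pass buffer standing in for the three stack vectors are invented for
the example; [rsp] itself keeps the sign-extend mask used by HACK_PSRAQ,
which has no scalar counterpart here).  smp is assumed to point at least
order samples into the buffer so the history reads are valid, and the
sketch assumes an arithmetic right shift on negative values, as the asm
performs.

#include <stdint.h>

/* Scalar model of one .looplen iteration: 12 samples (3*mmsize/4 with
 * mmsize = 16), residuals for even positions computed and buffered first,
 * then the odd positions, then the two halves interleaved into res[]. */
static void lpc_residual_block_sketch(int32_t *res, const int32_t *smp,
                                      const int32_t *coefs, int order,
                                      int shift)
{
    int32_t odd_pass[6];   /* stands in for the vectors parked at
                            * [rsp+mmsize], [rsp+mmsize*2], [rsp+mmsize*3] */

    /* first pass ("odd" in the asm): residuals for samples 0,2,...,10 */
    for (int k = 0; k < 6; k++) {
        int64_t p = 0;
        for (int j = 0; j < order; j++)
            p += (int64_t)coefs[j] * smp[2*k - 1 - j];   /* c * smp[i-j-1] */
        p >>= shift;                                     /* HACK_PSRAQ */
        if (p > INT32_MAX) p = INT32_MAX;                /* CLIPQ */
        if (p < INT32_MIN) p = INT32_MIN;
        odd_pass[k] = smp[2*k] - (int32_t)p;
    }

    /* second pass ("even"): residuals for samples 1,3,...,11, then the
     * buffered first-pass values are interleaved back in */
    for (int k = 0; k < 6; k++) {
        int64_t p = 0;
        for (int j = 0; j < order; j++)
            p += (int64_t)coefs[j] * smp[2*k - j];
        p >>= shift;
        if (p > INT32_MAX) p = INT32_MAX;
        if (p < INT32_MIN) p = INT32_MIN;
        res[2*k]     = odd_pass[k];
        res[2*k + 1] = smp[2*k + 1] - (int32_t)p;
    }
}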
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 952fc8b86b..91989ce560 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -152,13 +152,13 @@ RET
 %macro FUNCTION_BODY_32 0
 
 %if ARCH_X86_64
-    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize*4, res, smp, len, order, coefs
     DECLARE_REG_TMP 5, 6
     %define length r2d
 
     movsxd orderq, orderd
 %else
-    cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_32, 5, 6, 8, mmsize*4, res, smp, len, order, coefs
     DECLARE_REG_TMP 2, 5
     %define length r2mp
 %endif
@@ -189,18 +189,23 @@ mova [rsp], m4 ; save sign extend mask
 %define negj t1q
 
 .looplen:
+    ; process "odd" samples
     pxor m0, m0
     pxor m4, m4
     pxor m6, m6
     mov posj, orderq
     xor negj, negj
 
-    .looporder:
+    .looporder1:
         movd   m2, [coefsq+posj*4]  ; c = coefs[j]
         SPLATD m2
-        pmovzxdq m1, [smpq+negj*4-4]  ; s = smp[i-j-1]
-        pmovzxdq m5, [smpq+negj*4-4+mmsize/2]
-        pmovzxdq m7, [smpq+negj*4-4+mmsize]
+        movu   m1, [smpq+negj*4-4]  ; s = smp[i-j-1]
+        movu   m5, [smpq+negj*4-4+mmsize]
+        movu   m7, [smpq+negj*4-4+mmsize*2]
+        ; Rather than explicitly unpack adjacent samples into qwords we can let
+        ; the pmuldq instruction unpack the 0th and 2nd samples for us when it
+        ; does its multiply.  This saves an unpack for every sample in the inner
+        ; loop meaning it should be (much) quicker.
         pmuldq m1, m2
         pmuldq m5, m2
         pmuldq m7, m2
@@ -210,7 +215,7 @@ mova [rsp], m4 ; save sign extend mask
 
         dec negj
         inc posj
-    jnz .looporder
+    jnz .looporder1
 
     HACK_PSRAQ m0, m3, [rsp], m2  ; p >>= shift
     HACK_PSRAQ m4, m3, [rsp], m2
@@ -218,22 +223,70 @@ mova [rsp], m4 ; save sign extend mask
     CLIPQ m0, [pq_int_min], [pq_int_max], m2  ; clip(p >> shift)
     CLIPQ m4, [pq_int_min], [pq_int_max], m2
     CLIPQ m6, [pq_int_min], [pq_int_max], m2
-    pshufd m0, m0, q0020  ; pack into first 2 dwords
-    pshufd m4, m4, q0020
-    pshufd m6, m6, q0020
-    movh m1, [smpq]
-    movh m5, [smpq+mmsize/2]
-    movh m7, [smpq+mmsize]
+    movu m1, [smpq]
+    movu m5, [smpq+mmsize]
+    movu m7, [smpq+mmsize*2]
     psubd m1, m0  ; smp[i] - p
     psubd m5, m4
     psubd m7, m6
-    movh [resq], m1  ; res[i] = smp[i] - (p >> shift)
-    movh [resq+mmsize/2], m5
-    movh [resq+mmsize], m7
+    mova [rsp+mmsize], m1  ; res[i] = smp[i] - (p >> shift)
+    mova [rsp+mmsize*2], m5
+    mova [rsp+mmsize*3], m7
+
+    ; process "even" samples
+    pxor m0, m0
+    pxor m4, m4
+    pxor m6, m6
+    mov posj, orderq
+    xor negj, negj
+
+    .looporder2:
+        movd   m2, [coefsq+posj*4]  ; c = coefs[j]
+        SPLATD m2
+        movu   m1, [smpq+negj*4]  ; s = smp[i-j-1]
+        movu   m5, [smpq+negj*4+mmsize]
+        movu   m7, [smpq+negj*4+mmsize*2]
+        pmuldq m1, m2
+        pmuldq m5, m2
+        pmuldq m7, m2
+        paddq  m0, m1  ; p += c * s
+        paddq  m4, m5
+        paddq  m6, m7
+
+        dec negj
+        inc posj
+    jnz .looporder2
+
+    HACK_PSRAQ m0, m3, [rsp], m2  ; p >>= shift
+    HACK_PSRAQ m4, m3, [rsp], m2
+    HACK_PSRAQ m6, m3, [rsp], m2
+    CLIPQ m0, [pq_int_min], [pq_int_max], m2  ; clip(p >> shift)
+    CLIPQ m4, [pq_int_min], [pq_int_max], m2
+    CLIPQ m6, [pq_int_min], [pq_int_max], m2
+    movu m1, [smpq+4]
+    movu m5, [smpq+4+mmsize]
+    movu m7, [smpq+4+mmsize*2]
+    psubd m1, m0  ; smp[i] - p
+    psubd m5, m4
+    psubd m7, m6
+
+    ; interleave odd and even samples
+    pslldq m1, 4
+    pslldq m5, 4
+    pslldq m7, 4
+
+    pblendw m1, [rsp+mmsize], q0303
+    pblendw m5, [rsp+mmsize*2], q0303
+    pblendw m7, [rsp+mmsize*3], q0303
+
+    movu [resq], m1
+    movu [resq+mmsize], m5
+    movu [resq+mmsize*2], m7
+
+    add resq, 3*mmsize
+    add smpq, 3*mmsize
+    sub length, (3*mmsize)/4
 
-    add resq, (3*mmsize)/2
-    add smpq, (3*mmsize)/2
-    sub length, (3*mmsize)/8
 jg .looplen
 RET
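As a reference for the two instruction-level tricks the patch leans on,
here is a small, self-contained SSE4.1 intrinsics sketch (illustrative
only, not part of the patch; the sample values are made up).  pmuldq
(_mm_mul_epi32) reads only dwords 0 and 2 of each operand and returns
their two signed 64-bit products, which is what lets a plain movu load
replace the pmovzxdq unpack, and pslldq by 4 followed by pblendw with
immediate 0x33 (q0303) interleaves the dword-0/2 results of the two
passes into four consecutive residuals.

#include <smmintrin.h>   /* SSE4.1: _mm_mul_epi32, _mm_blend_epi16 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* pmuldq: multiplies dwords 0 and 2; dwords 1 and 3 are ignored */
    int32_t smp[4] = { -7, 111, 5, 222 };
    int32_t coef   = 3;

    __m128i s = _mm_loadu_si128((const __m128i *)smp);   /* movu   */
    __m128i c = _mm_set1_epi32(coef);                    /* SPLATD */
    __m128i p = _mm_mul_epi32(s, c);                     /* pmuldq */

    int64_t prod[2];
    _mm_storeu_si128((__m128i *)prod, p);
    printf("products: %lld %lld\n", (long long)prod[0], (long long)prod[1]);

    /* interleave: first-pass residuals sit in dwords 0/2 of pass1, the
     * second-pass ones in dwords 0/2 of pass2; the result is
     * {p1.0, p2.0, p1.2, p2.2}, i.e. four consecutive samples */
    __m128i pass1 = _mm_setr_epi32(10, 0, 30, 0);
    __m128i pass2 = _mm_setr_epi32(20, 0, 40, 0);
    __m128i r = _mm_blend_epi16(_mm_slli_si128(pass2, 4), /* pslldq m, 4       */
                                pass1, 0x33);             /* pblendw ..., q0303 */

    int32_t out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("interleaved: %d %d %d %d\n", out[0], out[1], out[2], out[3]);
    return 0;
}

Built with something like gcc -msse4.1, this should print "products: -21
15" and "interleaved: 10 20 30 40", mirroring what the pblendw with q0303
does to the two buffered pass results.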