Message ID | 20171126225111.5108-5-james.darnley@gmail.com |
---|---|
State | New |
Headers | show |
On 26 November 2017 at 22:51, James Darnley <james.darnley@gmail.com> wrote:
> Now does 6 samples per iteration, up from 2.
>
> From 1.6 to 2.1 times faster again. 2.5 to 3.9 times faster overall.
> Runtime is reduced by a further 4 to 17%. Reduced by 9 to 65% overall.
>
> Same conditions as previously.
> ---
> libavcodec/x86/flac_dsp_gpl.asm | 30 +++++++++++++++++++++++++-----
> 1 file changed, 25 insertions(+), 5 deletions(-)
>
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> index 618306eb5f..4d212ed212 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -152,13 +152,13 @@ RET
> %macro FUNCTION_BODY_32 0
>
> %if ARCH_X86_64
> - cglobal flac_enc_lpc_32, 5, 7, 4, mmsize, res, smp, len, order, coefs
> + cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
> DECLARE_REG_TMP 5, 6
> %define length r2d
>
> movsxd orderq, orderd
> %else
> - cglobal flac_enc_lpc_32, 5, 6, 4, mmsize, res, smp, len, order, coefs
> + cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, len, order, coefs
> DECLARE_REG_TMP 2, 5
> %define length r2mp
> %endif
> @@ -190,6 +190,8 @@ mova [rsp], m4 ; save sign extend mask
>
> .looplen:
> pxor m0, m0
> + pxor m4, m4
> + pxor m6, m6
> mov posj, orderq
> xor negj, negj
>
> @@ -197,23 +199,41 @@ mova [rsp], m4 ; save sign extend mask
> movd m2, [coefsq+posj*4] ; c = coefs[j]
> SPLATD m2
> pmovzxdq m1, [smpq+negj*4-4] ; s = smp[i-j-1]
> + pmovzxdq m5, [smpq+negj*4-4+mmsize/2]
> + pmovzxdq m7, [smpq+negj*4-4+mmsize]
> pmuldq m1, m2
> + pmuldq m5, m2
> + pmuldq m7, m2
> paddq m0, m1 ; p += c * s
> + paddq m4, m5
> + paddq m6, m7
>
> dec negj
> inc posj
> jnz .looporder
>
> HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
> + HACK_PSRAQ m4, m3, [rsp], m2
> + HACK_PSRAQ m6, m3, [rsp], m2
> CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
> + CLIPQ m4, [pq_int_min], [pq_int_max], m2
> + CLIPQ m6, [pq_int_min], [pq_int_max], m2
> pshufd m0, m0, q0020 ; pack into first 2 dwords
> + pshufd m4, m4, q0020
> + pshufd m6, m6, q0020
> movh m1, [smpq]
> + movh m5, [smpq+mmsize/2]
> + movh m7, [smpq+mmsize]
> psubd m1, m0 ; smp[i] - p
> + psubd m5, m4
> + psubd m7, m6
> movh [resq], m1 ; res[i] = smp[i] - (p >> shift)
> + movh [resq+mmsize/2], m5
> + movh [resq+mmsize], m7
>
> - add resq, mmsize/2
> - add smpq, mmsize/2
> - sub length, mmsize/8
> + add resq, (3*mmsize)/2
> + add smpq, (3*mmsize)/2
> + sub length, (3*mmsize)/8
> jg .looplen
> RET
>
> --
> 2.15.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

lgtm, tnx
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 618306eb5f..4d212ed212 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -152,13 +152,13 @@ RET
 %macro FUNCTION_BODY_32 0

 %if ARCH_X86_64
- cglobal flac_enc_lpc_32, 5, 7, 4, mmsize, res, smp, len, order, coefs
+ cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
 DECLARE_REG_TMP 5, 6
 %define length r2d

 movsxd orderq, orderd
 %else
- cglobal flac_enc_lpc_32, 5, 6, 4, mmsize, res, smp, len, order, coefs
+ cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, len, order, coefs
 DECLARE_REG_TMP 2, 5
 %define length r2mp
 %endif
@@ -190,6 +190,8 @@ mova [rsp], m4 ; save sign extend mask

 .looplen:
 pxor m0, m0
+ pxor m4, m4
+ pxor m6, m6
 mov posj, orderq
 xor negj, negj

@@ -197,23 +199,41 @@ mova [rsp], m4 ; save sign extend mask
 movd m2, [coefsq+posj*4] ; c = coefs[j]
 SPLATD m2
 pmovzxdq m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+ pmovzxdq m5, [smpq+negj*4-4+mmsize/2]
+ pmovzxdq m7, [smpq+negj*4-4+mmsize]
 pmuldq m1, m2
+ pmuldq m5, m2
+ pmuldq m7, m2
 paddq m0, m1 ; p += c * s
+ paddq m4, m5
+ paddq m6, m7

 dec negj
 inc posj
 jnz .looporder

 HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
+ HACK_PSRAQ m4, m3, [rsp], m2
+ HACK_PSRAQ m6, m3, [rsp], m2
 CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
+ CLIPQ m4, [pq_int_min], [pq_int_max], m2
+ CLIPQ m6, [pq_int_min], [pq_int_max], m2
 pshufd m0, m0, q0020 ; pack into first 2 dwords
+ pshufd m4, m4, q0020
+ pshufd m6, m6, q0020
 movh m1, [smpq]
+ movh m5, [smpq+mmsize/2]
+ movh m7, [smpq+mmsize]
 psubd m1, m0 ; smp[i] - p
+ psubd m5, m4
+ psubd m7, m6
 movh [resq], m1 ; res[i] = smp[i] - (p >> shift)
+ movh [resq+mmsize/2], m5
+ movh [resq+mmsize], m7

- add resq, mmsize/2
- add smpq, mmsize/2
- sub length, mmsize/8
+ add resq, (3*mmsize)/2
+ add smpq, (3*mmsize)/2
+ sub length, (3*mmsize)/8
 jg .looplen
 RET