Message ID | 20171126225111.5108-3-james.darnley@gmail.com |
---|---|
State | New |
Headers | show |
On 26 November 2017 at 22:51, James Darnley <james.darnley@gmail.com> wrote: > When compared to the SSE4 version, runtime is reduced by 0.5 to 20%. > After a bug fix long, long ago in e609cfd697 the 16-bit lpc encoder is > used so little that the runtime reduction is no longer correct. The > function itself is around 2 times faster. (As one might expect for > doing twice as many samples every iteration.) > --- > libavcodec/flacenc.c | 2 +- > libavcodec/x86/flac_dsp_gpl.asm | 32 +++++++++++++++++++++++++++----- > libavcodec/x86/flacdsp_init.c | 5 +++++ > 3 files changed, 33 insertions(+), 6 deletions(-) > > diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c > index 170c3caf48..cf25982c91 100644 > --- a/libavcodec/flacenc.c > +++ b/libavcodec/flacenc.c > @@ -88,7 +88,7 @@ typedef struct FlacSubframe { > uint64_t rc_sums[32][MAX_PARTITIONS]; > > int32_t samples[FLAC_MAX_BLOCKSIZE]; > - int32_t residual[FLAC_MAX_BLOCKSIZE+11]; > + int32_t residual[FLAC_MAX_BLOCKSIZE+23]; > } FlacSubframe; > > typedef struct FlacFrame { > diff --git a/libavcodec/x86/flac_dsp_gpl.asm > b/libavcodec/x86/flac_dsp_gpl.asm > index e285158185..c461c666be 100644 > --- a/libavcodec/x86/flac_dsp_gpl.asm > +++ b/libavcodec/x86/flac_dsp_gpl.asm > @@ -24,7 +24,8 @@ > > SECTION .text > > -INIT_XMM sse4 > +%macro FUNCTION_BODY_16 0 > + > %if ARCH_X86_64 > cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs > DECLARE_REG_TMP 5, 6 > @@ -51,7 +52,7 @@ lea resq, [resq+orderq*4] > lea smpq, [smpq+orderq*4] > lea coefsq, [coefsq+orderq*4] > sub length, orderd > -movd m3, r5m > +movd xm3, r5m > neg orderq > > %define posj t0q > @@ -65,8 +66,20 @@ neg orderq > xor negj, negj > > .looporder: > +%if cpuflag(avx) > + vbroadcastss m2, [coefsq+posj*4] > +%else > movd m2, [coefsq+posj*4] ; c = coefs[j] > SPLATD m2 > +%endif > +%if cpuflag(avx) > + vpmulld m1, m2, [smpq+negj*4-4] > + vpmulld m5, m2, [smpq+negj*4-4+mmsize] > + vpmulld m7, m2, [smpq+negj*4-4+mmsize*2] > + vpaddd m0, m1 > + vpaddd 
m4, m5 > + vpaddd m6, m7 > Same as the 32-bit lpc avx2 patch. > +%else > movu m1, [smpq+negj*4-4] ; s = smp[i-j-1] > movu m5, [smpq+negj*4-4+mmsize] > movu m7, [smpq+negj*4-4+mmsize*2] > @@ -76,14 +89,15 @@ neg orderq > paddd m0, m1 ; p += c * s > paddd m4, m5 > paddd m6, m7 > +%endif > > dec negj > inc posj > jnz .looporder > > - psrad m0, m3 ; p >>= shift > - psrad m4, m3 > - psrad m6, m3 > + psrad m0, xm3 ; p >>= shift > + psrad m4, xm3 > + psrad m6, xm3 > movu m1, [smpq] > movu m5, [smpq+mmsize] > movu m7, [smpq+mmsize*2] > @@ -99,3 +113,11 @@ neg orderq > sub length, (3*mmsize)/4 > jg .looplen > RET > + > +%endmacro > + > +INIT_XMM sse4 > +FUNCTION_BODY_16 > + > +INIT_YMM avx2 > +FUNCTION_BODY_16 > diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c > index 1971f81b8d..0a5c01859f 100644 > --- a/libavcodec/x86/flacdsp_init.c > +++ b/libavcodec/x86/flacdsp_init.c > @@ -28,6 +28,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int > coeffs[32], int order, > int qlevel, int len); > > void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const > int32_t *,int); > +void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const > int32_t *,int); > > #define DECORRELATE_FUNCS(fmt, opt) > \ > void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, > int channels, \ > @@ -110,6 +111,10 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, > enum AVSampleFormat fmt, int > if (CONFIG_GPL) > c->lpc16_encode = ff_flac_enc_lpc_16_sse4; > } > + if (EXTERNAL_AVX2(cpu_flags)) { > + if (CONFIG_GPL) > Yeah, just combine them; if someone wants to add non-GPL asm, this is the least of their problems. > + c->lpc16_encode = ff_flac_enc_lpc_16_avx2; > + } > #endif > #endif /* HAVE_X86ASM */ > } > -- > 2.15.0 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c index 170c3caf48..cf25982c91 100644 --- a/libavcodec/flacenc.c +++ b/libavcodec/flacenc.c @@ -88,7 +88,7 @@ typedef struct FlacSubframe { uint64_t rc_sums[32][MAX_PARTITIONS]; int32_t samples[FLAC_MAX_BLOCKSIZE]; - int32_t residual[FLAC_MAX_BLOCKSIZE+11]; + int32_t residual[FLAC_MAX_BLOCKSIZE+23]; } FlacSubframe; typedef struct FlacFrame { diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm index e285158185..c461c666be 100644 --- a/libavcodec/x86/flac_dsp_gpl.asm +++ b/libavcodec/x86/flac_dsp_gpl.asm @@ -24,7 +24,8 @@ SECTION .text -INIT_XMM sse4 +%macro FUNCTION_BODY_16 0 + %if ARCH_X86_64 cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs DECLARE_REG_TMP 5, 6 @@ -51,7 +52,7 @@ lea resq, [resq+orderq*4] lea smpq, [smpq+orderq*4] lea coefsq, [coefsq+orderq*4] sub length, orderd -movd m3, r5m +movd xm3, r5m neg orderq %define posj t0q @@ -65,8 +66,20 @@ neg orderq xor negj, negj .looporder: +%if cpuflag(avx) + vbroadcastss m2, [coefsq+posj*4] +%else movd m2, [coefsq+posj*4] ; c = coefs[j] SPLATD m2 +%endif +%if cpuflag(avx) + vpmulld m1, m2, [smpq+negj*4-4] + vpmulld m5, m2, [smpq+negj*4-4+mmsize] + vpmulld m7, m2, [smpq+negj*4-4+mmsize*2] + vpaddd m0, m1 + vpaddd m4, m5 + vpaddd m6, m7 +%else movu m1, [smpq+negj*4-4] ; s = smp[i-j-1] movu m5, [smpq+negj*4-4+mmsize] movu m7, [smpq+negj*4-4+mmsize*2] @@ -76,14 +89,15 @@ neg orderq paddd m0, m1 ; p += c * s paddd m4, m5 paddd m6, m7 +%endif dec negj inc posj jnz .looporder - psrad m0, m3 ; p >>= shift - psrad m4, m3 - psrad m6, m3 + psrad m0, xm3 ; p >>= shift + psrad m4, xm3 + psrad m6, xm3 movu m1, [smpq] movu m5, [smpq+mmsize] movu m7, [smpq+mmsize*2] @@ -99,3 +113,11 @@ neg orderq sub length, (3*mmsize)/4 jg .looplen RET + +%endmacro + +INIT_XMM sse4 +FUNCTION_BODY_16 + +INIT_YMM avx2 +FUNCTION_BODY_16 diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c index 1971f81b8d..0a5c01859f 100644 --- 
a/libavcodec/x86/flacdsp_init.c +++ b/libavcodec/x86/flacdsp_init.c @@ -28,6 +28,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order, int qlevel, int len); void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int); +void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int); #define DECORRELATE_FUNCS(fmt, opt) \ void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ @@ -110,6 +111,10 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int if (CONFIG_GPL) c->lpc16_encode = ff_flac_enc_lpc_16_sse4; } + if (EXTERNAL_AVX2(cpu_flags)) { + if (CONFIG_GPL) + c->lpc16_encode = ff_flac_enc_lpc_16_avx2; + } #endif #endif /* HAVE_X86ASM */ }