Message ID | 20240526014207.2697057-1-dev@lynne.ee |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,v2] lpc: rewrite lpc_compute_autocorr in external asm | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On 5/25/2024 10:42 PM, Lynne via ffmpeg-devel wrote: > The inline asm function had issues running under checkasm. > So I came to finish what I started, and wrote the last part > of LPC computation in assembly. > --- > libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++ > libavcodec/x86/lpc_init.c | 87 ++++--------------------------------- > 2 files changed, 100 insertions(+), 78 deletions(-) > > diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm > index a585c17ef5..9c359ae480 100644 > --- a/libavcodec/x86/lpc.asm > +++ b/libavcodec/x86/lpc.asm > @@ -261,3 +261,94 @@ APPLY_WELCH_FN > INIT_YMM avx2 > APPLY_WELCH_FN > %endif > + > +%macro COMPUTE_AUTOCORR_FN 0 > +cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p, data_l, len_p > + shl lagd, 3 > + shl lenq, 3 > + xor lag_pq, lag_pq > + > +.lag_l: > + movaps m2, [one_tab] Super nit: movapd > + > + mov len_pq, lag_pq > + > + lea data_lq, [lag_pq + mmsize - 8] > + neg data_lq ; -j - mmsize > + add data_lq, dataq ; data[-j - mmsize] > +.len_l: > + > +%if mmsize == 32 > + vbroadcastsd m0, [dataq + len_pq] > + vpermpd m1, [data_lq + len_pq], q0123 > +%else > + movupd m1, [data_lq + len_pq] ; data[i - j] > + movsd xm0, [dataq + len_pq] ; data[i] > + shufpd m1, m1, m1, 01b > +%endif > + > + shufpd m0, m0, m0, 1100b This is not needed for mmsize == 32. The broadcast set every qword to the value movsd loaded. > + > + ; fmadd actually hurts performance in this case due to > + ; the earlier loads + shuffles > + mulpd m0, m1 > + addpd m2, m0 ; sum += data[i]*data[i-j] > + > + add len_pq, 8 > + cmp len_pq, lenq > + jl .len_l > + > + movupd [autocq + lag_pq], m2 ; autoc[j] = sum > + add lag_pq, mmsize > + cmp lag_pq, lagq > + jl .lag_l > + > + ; The tail computation is guaranteed never to happen > + ; as long as we're doing multiples of 4, rather than 2. 
> +%if mmsize != 32 > + jg .end > + ; If lag_p == lag fallthrough > + > +.tail: > + movaps m2, [one_tab] > + > + mov len_pq, lag_pq > + sub len_pq, mmsize > + > + lea data_lq, [lag_pq] > + neg data_lq ; -j > + add data_lq, dataq ; data[-j] > + > +.tail_l: > + movupd m0, [dataq + len_pq] > + movupd m1, [data_lq + len_pq] > + > + mulpd m0, m1 > + addpd m2, m0 ; sum += data[i]*data[i-j] > + > + add len_pq, mmsize > + cmp len_pq, lenq > + jl .tail_l > + > + shufpd m1, m2, m2, 01b > + addpd m2, m1 > + > + ; Leave this here just in case its ever needed > +%if mmsize == 32 > + vperm2f128 m1, m2, m2, 0x01 > + addpd xm2, xm1 > + movupd [autocq + lag_pq], xm2 > +%else > + movhpd [autocq + lag_pq], xm2 > +%endif > + > +.end: > +%endif > + > + RET > +%endmacro > + > +INIT_XMM sse2 > +COMPUTE_AUTOCORR_FN > +INIT_YMM avx vpermpd is avx2, so it needs to be that. > +COMPUTE_AUTOCORR_FN > diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c > index f2fca53799..bb174be53e 100644 > --- a/libavcodec/x86/lpc_init.c > +++ b/libavcodec/x86/lpc_init.c > @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len, > double *w_data); > void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len, > double *w_data); > - > -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 }; > - > -#if HAVE_SSE2_INLINE > - > -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag, > - double *autoc) > -{ > - int j; > - > - if((x86_reg)data & 15) > - data++; > - > - for(j=0; j<lag; j+=2){ > - x86_reg i = -len*sizeof(double); > - if(j == lag-2) { > - __asm__ volatile( > - "movsd "MANGLE(pd_1)", %%xmm0 \n\t" > - "movsd "MANGLE(pd_1)", %%xmm1 \n\t" > - "movsd "MANGLE(pd_1)", %%xmm2 \n\t" > - "1: \n\t" > - "movapd (%2,%0), %%xmm3 \n\t" > - "movupd -8(%3,%0), %%xmm4 \n\t" > - "movapd (%3,%0), %%xmm5 \n\t" > - "mulpd %%xmm3, %%xmm4 \n\t" > - "mulpd %%xmm3, %%xmm5 \n\t" > - "mulpd -16(%3,%0), %%xmm3 \n\t" > - "addpd %%xmm4, %%xmm1 
\n\t" > - "addpd %%xmm5, %%xmm0 \n\t" > - "addpd %%xmm3, %%xmm2 \n\t" > - "add $16, %0 \n\t" > - "jl 1b \n\t" > - "movhlps %%xmm0, %%xmm3 \n\t" > - "movhlps %%xmm1, %%xmm4 \n\t" > - "movhlps %%xmm2, %%xmm5 \n\t" > - "addsd %%xmm3, %%xmm0 \n\t" > - "addsd %%xmm4, %%xmm1 \n\t" > - "addsd %%xmm5, %%xmm2 \n\t" > - "movsd %%xmm0, (%1) \n\t" > - "movsd %%xmm1, 8(%1) \n\t" > - "movsd %%xmm2, 16(%1) \n\t" > - :"+&r"(i) > - :"r"(autoc+j), "r"(data+len), "r"(data+len-j) > - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1) > - :"memory" > - ); > - } else { > - __asm__ volatile( > - "movsd "MANGLE(pd_1)", %%xmm0 \n\t" > - "movsd "MANGLE(pd_1)", %%xmm1 \n\t" > - "1: \n\t" > - "movapd (%3,%0), %%xmm3 \n\t" > - "movupd -8(%4,%0), %%xmm4 \n\t" > - "mulpd %%xmm3, %%xmm4 \n\t" > - "mulpd (%4,%0), %%xmm3 \n\t" > - "addpd %%xmm4, %%xmm1 \n\t" > - "addpd %%xmm3, %%xmm0 \n\t" > - "add $16, %0 \n\t" > - "jl 1b \n\t" > - "movhlps %%xmm0, %%xmm3 \n\t" > - "movhlps %%xmm1, %%xmm4 \n\t" > - "addsd %%xmm3, %%xmm0 \n\t" > - "addsd %%xmm4, %%xmm1 \n\t" > - "movsd %%xmm0, %1 \n\t" > - "movsd %%xmm1, %2 \n\t" > - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]) > - :"r"(data+len), "r"(data+len-j) > - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1) > - ); > - } > - } > -} > - > -#endif /* HAVE_SSE2_INLINE */ > +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag, > + double *autoc); > +void ff_lpc_compute_autocorr_avx(const double *data, ptrdiff_t len, int lag, > + double *autoc); > > av_cold void ff_lpc_init_x86(LPCContext *c) > { > int cpu_flags = av_get_cpu_flags(); > > -#if HAVE_SSE2_INLINE > - if (INLINE_SSE2_SLOW(cpu_flags)) > - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; > -#endif > + if (EXTERNAL_SSE2(cpu_flags)) > + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2; Place this with ff_lpc_apply_welch_window_sse2 below. 
> + > + if (EXTERNAL_AVX_FAST(cpu_flags)) > + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx; > > if (EXTERNAL_SSE2(cpu_flags)) > c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
On 5/25/2024 10:51 PM, James Almer wrote: > On 5/25/2024 10:42 PM, Lynne via ffmpeg-devel wrote: >> The inline asm function had issues running under checkasm. >> So I came to finish what I started, and wrote the last part >> of LPC computation in assembly. >> --- >> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++ >> libavcodec/x86/lpc_init.c | 87 ++++--------------------------------- >> 2 files changed, 100 insertions(+), 78 deletions(-) >> >> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm >> index a585c17ef5..9c359ae480 100644 >> --- a/libavcodec/x86/lpc.asm >> +++ b/libavcodec/x86/lpc.asm >> @@ -261,3 +261,94 @@ APPLY_WELCH_FN >> INIT_YMM avx2 >> APPLY_WELCH_FN >> %endif >> + >> +%macro COMPUTE_AUTOCORR_FN 0 >> +cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p, >> data_l, len_p >> + shl lagd, 3 >> + shl lenq, 3 >> + xor lag_pq, lag_pq >> + >> +.lag_l: >> + movaps m2, [one_tab] > > Super nit: movapd > >> + >> + mov len_pq, lag_pq >> + >> + lea data_lq, [lag_pq + mmsize - 8] >> + neg data_lq ; -j - mmsize >> + add data_lq, dataq ; data[-j - mmsize] >> +.len_l: >> + >> +%if mmsize == 32 >> + vbroadcastsd m0, [dataq + len_pq] >> + vpermpd m1, [data_lq + len_pq], q0123 >> +%else >> + movupd m1, [data_lq + len_pq] ; data[i - j] >> + movsd xm0, [dataq + len_pq] ; data[i] >> + shufpd m1, m1, m1, 01b I just realized you're shuffling the values inside the len_1 loop when you could do it right before you store the sum. Something like: [...] 
.len_l: %if mmsize == 16 movsd m0, [dataq + len_pq] ; data[i] shufpd m0, m0, m0, 0 movupd m1, [data_lq + len_pq] ; data[i - j] mulpd m0, m1 %else vbroadcastsd m0, [dataq + len_pq] mulpd m0, [data_lq + len_pq] ; data[i - j] %endif addpd m2, m0 ; sum += data[i]*data[i-j] add len_pq, 8 cmp len_pq, lenq jl .len_l shufpd m2, m2, m2, 0101b %if mmsize == 32 vextractf128 [autocq + lag_pq], m2, 1 movupd [autocq + lag_pq + 16], xm2 ; autoc[j] = sum %else movupd [autocq + lag_pq], m2 ; autoc[j] = sum %endif add lag_pq, mmsize cmp lag_pq, lagq jl .lag_l [...] And by using vextractf128 here instead of vpermpd you can keep the function as avx instead of avx2, unless a vpermpd + single 256bit store is faster than shufpd + two stores (vextractf128 + movu 128bit), which I assume it won't be because of cross-lane shuffling. >> +%endif >> + >> + shufpd m0, m0, m0, 1100b > > This is not needed for mmsize == 32. The broadcast set every qword to > the value movsd loaded. > >> + >> + ; fmadd actually hurts performance in this case due to >> + ; the earlier loads + shuffles >> + mulpd m0, m1 >> + addpd m2, m0 ; sum += data[i]*data[i-j] >> + >> + add len_pq, 8 >> + cmp len_pq, lenq >> + jl .len_l >> + >> + movupd [autocq + lag_pq], m2 ; autoc[j] = sum >> + add lag_pq, mmsize >> + cmp lag_pq, lagq >> + jl .lag_l >> + >> + ; The tail computation is guaranteed never to happen >> + ; as long as we're doing multiples of 4, rather than 2. 
>> +%if mmsize != 32 >> + jg .end >> + ; If lag_p == lag fallthrough >> + >> +.tail: >> + movaps m2, [one_tab] >> + >> + mov len_pq, lag_pq >> + sub len_pq, mmsize >> + >> + lea data_lq, [lag_pq] >> + neg data_lq ; -j >> + add data_lq, dataq ; data[-j] >> + >> +.tail_l: >> + movupd m0, [dataq + len_pq] >> + movupd m1, [data_lq + len_pq] >> + >> + mulpd m0, m1 >> + addpd m2, m0 ; sum += data[i]*data[i-j] >> + >> + add len_pq, mmsize >> + cmp len_pq, lenq >> + jl .tail_l >> + >> + shufpd m1, m2, m2, 01b >> + addpd m2, m1 >> + >> + ; Leave this here just in case its ever needed >> +%if mmsize == 32 >> + vperm2f128 m1, m2, m2, 0x01 >> + addpd xm2, xm1 >> + movupd [autocq + lag_pq], xm2 >> +%else >> + movhpd [autocq + lag_pq], xm2 >> +%endif >> + >> +.end: >> +%endif >> + >> + RET >> +%endmacro >> + >> +INIT_XMM sse2 >> +COMPUTE_AUTOCORR_FN >> +INIT_YMM avx > > vpermpd is avx2, so it needs to be that. > >> +COMPUTE_AUTOCORR_FN >> diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c >> index f2fca53799..bb174be53e 100644 >> --- a/libavcodec/x86/lpc_init.c >> +++ b/libavcodec/x86/lpc_init.c >> @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t >> *data, ptrdiff_t len, >> double *w_data); >> void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len, >> double *w_data); >> - >> -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 }; >> - >> -#if HAVE_SSE2_INLINE >> - >> -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t >> len, int lag, >> - double *autoc) >> -{ >> - int j; >> - >> - if((x86_reg)data & 15) >> - data++; >> - >> - for(j=0; j<lag; j+=2){ >> - x86_reg i = -len*sizeof(double); >> - if(j == lag-2) { >> - __asm__ volatile( >> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t" >> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t" >> - "movsd "MANGLE(pd_1)", %%xmm2 \n\t" >> - "1: \n\t" >> - "movapd (%2,%0), %%xmm3 \n\t" >> - "movupd -8(%3,%0), %%xmm4 \n\t" >> - "movapd (%3,%0), %%xmm5 \n\t" >> - "mulpd %%xmm3, %%xmm4 \n\t" >> - 
"mulpd %%xmm3, %%xmm5 \n\t" >> - "mulpd -16(%3,%0), %%xmm3 \n\t" >> - "addpd %%xmm4, %%xmm1 \n\t" >> - "addpd %%xmm5, %%xmm0 \n\t" >> - "addpd %%xmm3, %%xmm2 \n\t" >> - "add $16, %0 \n\t" >> - "jl 1b \n\t" >> - "movhlps %%xmm0, %%xmm3 \n\t" >> - "movhlps %%xmm1, %%xmm4 \n\t" >> - "movhlps %%xmm2, %%xmm5 \n\t" >> - "addsd %%xmm3, %%xmm0 \n\t" >> - "addsd %%xmm4, %%xmm1 \n\t" >> - "addsd %%xmm5, %%xmm2 \n\t" >> - "movsd %%xmm0, (%1) \n\t" >> - "movsd %%xmm1, 8(%1) \n\t" >> - "movsd %%xmm2, 16(%1) \n\t" >> - :"+&r"(i) >> - :"r"(autoc+j), "r"(data+len), "r"(data+len-j) >> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1) >> - :"memory" >> - ); >> - } else { >> - __asm__ volatile( >> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t" >> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t" >> - "1: \n\t" >> - "movapd (%3,%0), %%xmm3 \n\t" >> - "movupd -8(%4,%0), %%xmm4 \n\t" >> - "mulpd %%xmm3, %%xmm4 \n\t" >> - "mulpd (%4,%0), %%xmm3 \n\t" >> - "addpd %%xmm4, %%xmm1 \n\t" >> - "addpd %%xmm3, %%xmm0 \n\t" >> - "add $16, %0 \n\t" >> - "jl 1b \n\t" >> - "movhlps %%xmm0, %%xmm3 \n\t" >> - "movhlps %%xmm1, %%xmm4 \n\t" >> - "addsd %%xmm3, %%xmm0 \n\t" >> - "addsd %%xmm4, %%xmm1 \n\t" >> - "movsd %%xmm0, %1 \n\t" >> - "movsd %%xmm1, %2 \n\t" >> - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]) >> - :"r"(data+len), "r"(data+len-j) >> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1) >> - ); >> - } >> - } >> -} >> - >> -#endif /* HAVE_SSE2_INLINE */ >> +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, >> int lag, >> + double *autoc); >> +void ff_lpc_compute_autocorr_avx(const double *data, ptrdiff_t len, >> int lag, >> + double *autoc); >> av_cold void ff_lpc_init_x86(LPCContext *c) >> { >> int cpu_flags = av_get_cpu_flags(); >> -#if HAVE_SSE2_INLINE >> - if (INLINE_SSE2_SLOW(cpu_flags)) >> - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; >> -#endif >> + if (EXTERNAL_SSE2(cpu_flags)) >> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2; > > Place this with ff_lpc_apply_welch_window_sse2 below. 
> >> + >> + if (EXTERNAL_AVX_FAST(cpu_flags)) >> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx; >> if (EXTERNAL_SSE2(cpu_flags)) >> c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
On Sun, May 26, 2024 at 03:42:01AM +0200, Lynne via ffmpeg-devel wrote: > The inline asm function had issues running under checkasm. > So I came to finish what I started, and wrote the last part > of LPC computation in assembly. > --- > libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++ > libavcodec/x86/lpc_init.c | 87 ++++--------------------------------- > 2 files changed, 100 insertions(+), 78 deletions(-) seems to break fate make: *** [tests/Makefile:311: fate-lavf-ogg] Error 1 make: *** [tests/Makefile:311: fate-iamf-stereo] Error 1 make: *** [tests/Makefile:311: fate-mov-mp4-iamf-stereo] Error 1 make: *** [tests/Makefile:311: fate-iamf-ambisonic_1] Error 1 make: *** [tests/Makefile:310: fate-mov-mp4-iamf-ambisonic_1] Error 1 make: *** [tests/Makefile:311: fate-mov-mp4-iamf-5_1_4] Error 1 make: *** [tests/Makefile:311: fate-iamf-5_1_4] Error 1 make: *** [tests/Makefile:311: fate-iamf-7_1_4] Error 1 make: *** [tests/Makefile:311: fate-mov-mp4-iamf-7_1_4] Error 1 make: *** [tests/Makefile:311: fate-cover-art-flac-remux] Error 1 thx [...]
diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm index a585c17ef5..9c359ae480 100644 --- a/libavcodec/x86/lpc.asm +++ b/libavcodec/x86/lpc.asm @@ -261,3 +261,94 @@ APPLY_WELCH_FN INIT_YMM avx2 APPLY_WELCH_FN %endif + +%macro COMPUTE_AUTOCORR_FN 0 +cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p, data_l, len_p + shl lagd, 3 + shl lenq, 3 + xor lag_pq, lag_pq + +.lag_l: + movaps m2, [one_tab] + + mov len_pq, lag_pq + + lea data_lq, [lag_pq + mmsize - 8] + neg data_lq ; -j - mmsize + add data_lq, dataq ; data[-j - mmsize] +.len_l: + +%if mmsize == 32 + vbroadcastsd m0, [dataq + len_pq] + vpermpd m1, [data_lq + len_pq], q0123 +%else + movupd m1, [data_lq + len_pq] ; data[i - j] + movsd xm0, [dataq + len_pq] ; data[i] + shufpd m1, m1, m1, 01b +%endif + + shufpd m0, m0, m0, 1100b + + ; fmadd actually hurts performance in this case due to + ; the earlier loads + shuffles + mulpd m0, m1 + addpd m2, m0 ; sum += data[i]*data[i-j] + + add len_pq, 8 + cmp len_pq, lenq + jl .len_l + + movupd [autocq + lag_pq], m2 ; autoc[j] = sum + add lag_pq, mmsize + cmp lag_pq, lagq + jl .lag_l + + ; The tail computation is guaranteed never to happen + ; as long as we're doing multiples of 4, rather than 2. 
+%if mmsize != 32 + jg .end + ; If lag_p == lag fallthrough + +.tail: + movaps m2, [one_tab] + + mov len_pq, lag_pq + sub len_pq, mmsize + + lea data_lq, [lag_pq] + neg data_lq ; -j + add data_lq, dataq ; data[-j] + +.tail_l: + movupd m0, [dataq + len_pq] + movupd m1, [data_lq + len_pq] + + mulpd m0, m1 + addpd m2, m0 ; sum += data[i]*data[i-j] + + add len_pq, mmsize + cmp len_pq, lenq + jl .tail_l + + shufpd m1, m2, m2, 01b + addpd m2, m1 + + ; Leave this here just in case its ever needed +%if mmsize == 32 + vperm2f128 m1, m2, m2, 0x01 + addpd xm2, xm1 + movupd [autocq + lag_pq], xm2 +%else + movhpd [autocq + lag_pq], xm2 +%endif + +.end: +%endif + + RET +%endmacro + +INIT_XMM sse2 +COMPUTE_AUTOCORR_FN +INIT_YMM avx +COMPUTE_AUTOCORR_FN diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c index f2fca53799..bb174be53e 100644 --- a/libavcodec/x86/lpc_init.c +++ b/libavcodec/x86/lpc_init.c @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len, double *w_data); void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len, double *w_data); - -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 }; - -#if HAVE_SSE2_INLINE - -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag, - double *autoc) -{ - int j; - - if((x86_reg)data & 15) - data++; - - for(j=0; j<lag; j+=2){ - x86_reg i = -len*sizeof(double); - if(j == lag-2) { - __asm__ volatile( - "movsd "MANGLE(pd_1)", %%xmm0 \n\t" - "movsd "MANGLE(pd_1)", %%xmm1 \n\t" - "movsd "MANGLE(pd_1)", %%xmm2 \n\t" - "1: \n\t" - "movapd (%2,%0), %%xmm3 \n\t" - "movupd -8(%3,%0), %%xmm4 \n\t" - "movapd (%3,%0), %%xmm5 \n\t" - "mulpd %%xmm3, %%xmm4 \n\t" - "mulpd %%xmm3, %%xmm5 \n\t" - "mulpd -16(%3,%0), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm1 \n\t" - "addpd %%xmm5, %%xmm0 \n\t" - "addpd %%xmm3, %%xmm2 \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - "movhlps %%xmm0, %%xmm3 \n\t" - "movhlps %%xmm1, %%xmm4 \n\t" - "movhlps %%xmm2, %%xmm5 \n\t" - "addsd 
%%xmm3, %%xmm0 \n\t" - "addsd %%xmm4, %%xmm1 \n\t" - "addsd %%xmm5, %%xmm2 \n\t" - "movsd %%xmm0, (%1) \n\t" - "movsd %%xmm1, 8(%1) \n\t" - "movsd %%xmm2, 16(%1) \n\t" - :"+&r"(i) - :"r"(autoc+j), "r"(data+len), "r"(data+len-j) - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1) - :"memory" - ); - } else { - __asm__ volatile( - "movsd "MANGLE(pd_1)", %%xmm0 \n\t" - "movsd "MANGLE(pd_1)", %%xmm1 \n\t" - "1: \n\t" - "movapd (%3,%0), %%xmm3 \n\t" - "movupd -8(%4,%0), %%xmm4 \n\t" - "mulpd %%xmm3, %%xmm4 \n\t" - "mulpd (%4,%0), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm1 \n\t" - "addpd %%xmm3, %%xmm0 \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - "movhlps %%xmm0, %%xmm3 \n\t" - "movhlps %%xmm1, %%xmm4 \n\t" - "addsd %%xmm3, %%xmm0 \n\t" - "addsd %%xmm4, %%xmm1 \n\t" - "movsd %%xmm0, %1 \n\t" - "movsd %%xmm1, %2 \n\t" - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]) - :"r"(data+len), "r"(data+len-j) - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1) - ); - } - } -} - -#endif /* HAVE_SSE2_INLINE */ +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag, + double *autoc); +void ff_lpc_compute_autocorr_avx(const double *data, ptrdiff_t len, int lag, + double *autoc); av_cold void ff_lpc_init_x86(LPCContext *c) { int cpu_flags = av_get_cpu_flags(); -#if HAVE_SSE2_INLINE - if (INLINE_SSE2_SLOW(cpu_flags)) - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; -#endif + if (EXTERNAL_SSE2(cpu_flags)) + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2; + + if (EXTERNAL_AVX_FAST(cpu_flags)) + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx; if (EXTERNAL_SSE2(cpu_flags)) c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;