diff mbox series

[FFmpeg-devel,v2] lpc: rewrite lpc_compute_autocorr in external asm

Message ID 20240526014207.2697057-1-dev@lynne.ee
State New
Headers show
Series [FFmpeg-devel,v2] lpc: rewrite lpc_compute_autocorr in external asm | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Lynne May 26, 2024, 1:42 a.m. UTC
The inline asm function had issues running under checkasm.
So I came to finish what I started, and wrote the last part
of LPC computation in assembly.
---
 libavcodec/x86/lpc.asm    | 91 +++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
 2 files changed, 100 insertions(+), 78 deletions(-)

Comments

James Almer May 26, 2024, 1:51 a.m. UTC | #1
On 5/25/2024 10:42 PM, Lynne via ffmpeg-devel wrote:
> The inline asm function had issues running under checkasm.
> So I came to finish what I started, and wrote the last part
> of LPC computation in assembly.
> ---
>   libavcodec/x86/lpc.asm    | 91 +++++++++++++++++++++++++++++++++++++++
>   libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
>   2 files changed, 100 insertions(+), 78 deletions(-)
> 
> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
> index a585c17ef5..9c359ae480 100644
> --- a/libavcodec/x86/lpc.asm
> +++ b/libavcodec/x86/lpc.asm
> @@ -261,3 +261,94 @@ APPLY_WELCH_FN
>   INIT_YMM avx2
>   APPLY_WELCH_FN
>   %endif
> +
> +%macro COMPUTE_AUTOCORR_FN 0
> +cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p, data_l, len_p
> +    shl lagd, 3
> +    shl lenq, 3
> +    xor lag_pq, lag_pq
> +
> +.lag_l:
> +    movaps m2, [one_tab]

Super nit: movapd

> +
> +    mov len_pq, lag_pq
> +
> +    lea data_lq, [lag_pq + mmsize - 8]
> +    neg data_lq                     ; -j - mmsize
> +    add data_lq, dataq              ; data[-j - mmsize]
> +.len_l:
> +
> +%if mmsize == 32
> +    vbroadcastsd m0, [dataq + len_pq]
> +    vpermpd m1, [data_lq + len_pq], q0123
> +%else
> +    movupd m1, [data_lq + len_pq]   ; data[i - j]
> +    movsd xm0, [dataq + len_pq]     ; data[i]
> +    shufpd m1, m1, m1, 01b
> +%endif
> +
> +    shufpd m0, m0, m0, 1100b

This is not needed for mmsize == 32. The broadcast set every qword to 
the value movsd loaded.

> +
> +    ; fmadd actually hurts performance in this case due to
> +    ; the earlier loads + shuffles
> +    mulpd m0, m1
> +    addpd m2, m0                    ; sum += data[i]*data[i-j]
> +
> +    add len_pq, 8
> +    cmp len_pq, lenq
> +    jl .len_l
> +
> +    movupd [autocq + lag_pq], m2    ; autoc[j] = sum
> +    add lag_pq, mmsize
> +    cmp lag_pq, lagq
> +    jl .lag_l
> +
> +    ; The tail computation is guaranteed never to happen
> +    ; as long as we're doing multiples of 4, rather than 2.
> +%if mmsize != 32
> +    jg .end
> +    ; If lag_p == lag fallthrough
> +
> +.tail:
> +    movaps m2, [one_tab]
> +
> +    mov len_pq, lag_pq
> +    sub len_pq, mmsize
> +
> +    lea data_lq, [lag_pq]
> +    neg data_lq                     ; -j
> +    add data_lq, dataq              ; data[-j]
> +
> +.tail_l:
> +    movupd m0, [dataq + len_pq]
> +    movupd m1, [data_lq + len_pq]
> +
> +    mulpd m0, m1
> +    addpd m2, m0                    ; sum += data[i]*data[i-j]
> +
> +    add len_pq, mmsize
> +    cmp len_pq, lenq
> +    jl .tail_l
> +
> +    shufpd m1, m2, m2, 01b
> +    addpd m2, m1
> +
> +    ; Leave this here just in case its ever needed
> +%if mmsize == 32
> +    vperm2f128 m1, m2, m2, 0x01
> +    addpd xm2, xm1
> +    movupd [autocq + lag_pq], xm2
> +%else
> +    movhpd [autocq + lag_pq], xm2
> +%endif
> +
> +.end:
> +%endif
> +
> +    RET
> +%endmacro
> +
> +INIT_XMM sse2
> +COMPUTE_AUTOCORR_FN
> +INIT_YMM avx

vpermpd is avx2, so it needs to be that.

> +COMPUTE_AUTOCORR_FN
> diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
> index f2fca53799..bb174be53e 100644
> --- a/libavcodec/x86/lpc_init.c
> +++ b/libavcodec/x86/lpc_init.c
> @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len,
>                                       double *w_data);
>   void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
>                                       double *w_data);
> -
> -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
> -
> -#if HAVE_SSE2_INLINE
> -
> -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
> -                                      double *autoc)
> -{
> -    int j;
> -
> -    if((x86_reg)data & 15)
> -        data++;
> -
> -    for(j=0; j<lag; j+=2){
> -        x86_reg i = -len*sizeof(double);
> -        if(j == lag-2) {
> -            __asm__ volatile(
> -                "movsd    "MANGLE(pd_1)", %%xmm0    \n\t"
> -                "movsd    "MANGLE(pd_1)", %%xmm1    \n\t"
> -                "movsd    "MANGLE(pd_1)", %%xmm2    \n\t"
> -                "1:                                 \n\t"
> -                "movapd   (%2,%0), %%xmm3           \n\t"
> -                "movupd -8(%3,%0), %%xmm4           \n\t"
> -                "movapd   (%3,%0), %%xmm5           \n\t"
> -                "mulpd     %%xmm3, %%xmm4           \n\t"
> -                "mulpd     %%xmm3, %%xmm5           \n\t"
> -                "mulpd -16(%3,%0), %%xmm3           \n\t"
> -                "addpd     %%xmm4, %%xmm1           \n\t"
> -                "addpd     %%xmm5, %%xmm0           \n\t"
> -                "addpd     %%xmm3, %%xmm2           \n\t"
> -                "add       $16,    %0               \n\t"
> -                "jl 1b                              \n\t"
> -                "movhlps   %%xmm0, %%xmm3           \n\t"
> -                "movhlps   %%xmm1, %%xmm4           \n\t"
> -                "movhlps   %%xmm2, %%xmm5           \n\t"
> -                "addsd     %%xmm3, %%xmm0           \n\t"
> -                "addsd     %%xmm4, %%xmm1           \n\t"
> -                "addsd     %%xmm5, %%xmm2           \n\t"
> -                "movsd     %%xmm0,   (%1)           \n\t"
> -                "movsd     %%xmm1,  8(%1)           \n\t"
> -                "movsd     %%xmm2, 16(%1)           \n\t"
> -                :"+&r"(i)
> -                :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
> -                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
> -                :"memory"
> -            );
> -        } else {
> -            __asm__ volatile(
> -                "movsd    "MANGLE(pd_1)", %%xmm0    \n\t"
> -                "movsd    "MANGLE(pd_1)", %%xmm1    \n\t"
> -                "1:                                 \n\t"
> -                "movapd   (%3,%0), %%xmm3           \n\t"
> -                "movupd -8(%4,%0), %%xmm4           \n\t"
> -                "mulpd     %%xmm3, %%xmm4           \n\t"
> -                "mulpd    (%4,%0), %%xmm3           \n\t"
> -                "addpd     %%xmm4, %%xmm1           \n\t"
> -                "addpd     %%xmm3, %%xmm0           \n\t"
> -                "add       $16,    %0               \n\t"
> -                "jl 1b                              \n\t"
> -                "movhlps   %%xmm0, %%xmm3           \n\t"
> -                "movhlps   %%xmm1, %%xmm4           \n\t"
> -                "addsd     %%xmm3, %%xmm0           \n\t"
> -                "addsd     %%xmm4, %%xmm1           \n\t"
> -                "movsd     %%xmm0, %1               \n\t"
> -                "movsd     %%xmm1, %2               \n\t"
> -                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
> -                :"r"(data+len), "r"(data+len-j)
> -                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
> -            );
> -        }
> -    }
> -}
> -
> -#endif /* HAVE_SSE2_INLINE */
> +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
> +                                  double *autoc);
> +void ff_lpc_compute_autocorr_avx(const double *data, ptrdiff_t len, int lag,
> +                                 double *autoc);
>   
>   av_cold void ff_lpc_init_x86(LPCContext *c)
>   {
>       int cpu_flags = av_get_cpu_flags();
>   
> -#if HAVE_SSE2_INLINE
> -    if (INLINE_SSE2_SLOW(cpu_flags))
> -        c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
> -#endif
> +    if (EXTERNAL_SSE2(cpu_flags))
> +        c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;

Place this with ff_lpc_apply_welch_window_sse2 below.

> +
> +    if (EXTERNAL_AVX_FAST(cpu_flags))
> +        c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx;
>   
>       if (EXTERNAL_SSE2(cpu_flags))
>           c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
James Almer May 26, 2024, 2:16 a.m. UTC | #2
On 5/25/2024 10:51 PM, James Almer wrote:
> On 5/25/2024 10:42 PM, Lynne via ffmpeg-devel wrote:
>> The inline asm function had issues running under checkasm.
>> So I came to finish what I started, and wrote the last part
>> of LPC computation in assembly.
>> ---
>>   libavcodec/x86/lpc.asm    | 91 +++++++++++++++++++++++++++++++++++++++
>>   libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
>>   2 files changed, 100 insertions(+), 78 deletions(-)
>>
>> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
>> index a585c17ef5..9c359ae480 100644
>> --- a/libavcodec/x86/lpc.asm
>> +++ b/libavcodec/x86/lpc.asm
>> @@ -261,3 +261,94 @@ APPLY_WELCH_FN
>>   INIT_YMM avx2
>>   APPLY_WELCH_FN
>>   %endif
>> +
>> +%macro COMPUTE_AUTOCORR_FN 0
>> +cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p, 
>> data_l, len_p
>> +    shl lagd, 3
>> +    shl lenq, 3
>> +    xor lag_pq, lag_pq
>> +
>> +.lag_l:
>> +    movaps m2, [one_tab]
> 
> Super nit: movapd
> 
>> +
>> +    mov len_pq, lag_pq
>> +
>> +    lea data_lq, [lag_pq + mmsize - 8]
>> +    neg data_lq                     ; -j - mmsize
>> +    add data_lq, dataq              ; data[-j - mmsize]
>> +.len_l:
>> +
>> +%if mmsize == 32
>> +    vbroadcastsd m0, [dataq + len_pq]
>> +    vpermpd m1, [data_lq + len_pq], q0123
>> +%else
>> +    movupd m1, [data_lq + len_pq]   ; data[i - j]
>> +    movsd xm0, [dataq + len_pq]     ; data[i]
>> +    shufpd m1, m1, m1, 01b

I just realized you're shuffling the values inside the len_1 loop when 
you could do it right before you store the sum.

Something like:

[...]
.len_l:
%if mmsize == 16
     movsd  m0, [dataq + len_pq]     ; data[i]
     shufpd m0, m0, m0, 0
     movupd m1, [data_lq + len_pq]   ; data[i - j]

     mulpd m0, m1
%else
     vbroadcastsd m0, [dataq + len_pq]
     mulpd m0, [data_lq + len_pq]   ; data[i - j]
%endif

     addpd m2, m0                    ; sum += data[i]*data[i-j]

     add len_pq, 8
     cmp len_pq, lenq
     jl .len_l

     shufpd m2, m2, m2, 0101b
%if mmsize == 32
     vextractf128 [autocq + lag_pq], m2, 1
     movupd [autocq + lag_pq + 16], xm2 ; autoc[j] = sum
%else
     movupd [autocq + lag_pq], m2       ; autoc[j] = sum
%endif
     add lag_pq, mmsize
     cmp lag_pq, lagq
     jl .lag_l
[...]

And by using vextractf128 here instead of vpermpd you can keep the 
function as avx instead of avx2, unless a vpermpd + single 256bit store 
is faster than shufpd + two stores (vextractf128 + movu 128bit), which i 
assume it wont because of crosslane shuffling.

>> +%endif
>> +
>> +    shufpd m0, m0, m0, 1100b
> 
> This is not needed for mmsize == 32. The broadcast set every qword to 
> the value movsd loaded.
> 
>> +
>> +    ; fmadd actually hurts performance in this case due to
>> +    ; the earlier loads + shuffles
>> +    mulpd m0, m1
>> +    addpd m2, m0                    ; sum += data[i]*data[i-j]
>> +
>> +    add len_pq, 8
>> +    cmp len_pq, lenq
>> +    jl .len_l
>> +
>> +    movupd [autocq + lag_pq], m2    ; autoc[j] = sum
>> +    add lag_pq, mmsize
>> +    cmp lag_pq, lagq
>> +    jl .lag_l
>> +
>> +    ; The tail computation is guaranteed never to happen
>> +    ; as long as we're doing multiples of 4, rather than 2.
>> +%if mmsize != 32
>> +    jg .end
>> +    ; If lag_p == lag fallthrough
>> +
>> +.tail:
>> +    movaps m2, [one_tab]
>> +
>> +    mov len_pq, lag_pq
>> +    sub len_pq, mmsize
>> +
>> +    lea data_lq, [lag_pq]
>> +    neg data_lq                     ; -j
>> +    add data_lq, dataq              ; data[-j]
>> +
>> +.tail_l:
>> +    movupd m0, [dataq + len_pq]
>> +    movupd m1, [data_lq + len_pq]
>> +
>> +    mulpd m0, m1
>> +    addpd m2, m0                    ; sum += data[i]*data[i-j]
>> +
>> +    add len_pq, mmsize
>> +    cmp len_pq, lenq
>> +    jl .tail_l
>> +
>> +    shufpd m1, m2, m2, 01b
>> +    addpd m2, m1
>> +
>> +    ; Leave this here just in case its ever needed
>> +%if mmsize == 32
>> +    vperm2f128 m1, m2, m2, 0x01
>> +    addpd xm2, xm1
>> +    movupd [autocq + lag_pq], xm2
>> +%else
>> +    movhpd [autocq + lag_pq], xm2
>> +%endif
>> +
>> +.end:
>> +%endif
>> +
>> +    RET
>> +%endmacro
>> +
>> +INIT_XMM sse2
>> +COMPUTE_AUTOCORR_FN
>> +INIT_YMM avx
> 
> vpermpd is avx2, so it needs to be that.
> 
>> +COMPUTE_AUTOCORR_FN
>> diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
>> index f2fca53799..bb174be53e 100644
>> --- a/libavcodec/x86/lpc_init.c
>> +++ b/libavcodec/x86/lpc_init.c
>> @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t 
>> *data, ptrdiff_t len,
>>                                       double *w_data);
>>   void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
>>                                       double *w_data);
>> -
>> -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
>> -
>> -#if HAVE_SSE2_INLINE
>> -
>> -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t 
>> len, int lag,
>> -                                      double *autoc)
>> -{
>> -    int j;
>> -
>> -    if((x86_reg)data & 15)
>> -        data++;
>> -
>> -    for(j=0; j<lag; j+=2){
>> -        x86_reg i = -len*sizeof(double);
>> -        if(j == lag-2) {
>> -            __asm__ volatile(
>> -                "movsd    "MANGLE(pd_1)", %%xmm0    \n\t"
>> -                "movsd    "MANGLE(pd_1)", %%xmm1    \n\t"
>> -                "movsd    "MANGLE(pd_1)", %%xmm2    \n\t"
>> -                "1:                                 \n\t"
>> -                "movapd   (%2,%0), %%xmm3           \n\t"
>> -                "movupd -8(%3,%0), %%xmm4           \n\t"
>> -                "movapd   (%3,%0), %%xmm5           \n\t"
>> -                "mulpd     %%xmm3, %%xmm4           \n\t"
>> -                "mulpd     %%xmm3, %%xmm5           \n\t"
>> -                "mulpd -16(%3,%0), %%xmm3           \n\t"
>> -                "addpd     %%xmm4, %%xmm1           \n\t"
>> -                "addpd     %%xmm5, %%xmm0           \n\t"
>> -                "addpd     %%xmm3, %%xmm2           \n\t"
>> -                "add       $16,    %0               \n\t"
>> -                "jl 1b                              \n\t"
>> -                "movhlps   %%xmm0, %%xmm3           \n\t"
>> -                "movhlps   %%xmm1, %%xmm4           \n\t"
>> -                "movhlps   %%xmm2, %%xmm5           \n\t"
>> -                "addsd     %%xmm3, %%xmm0           \n\t"
>> -                "addsd     %%xmm4, %%xmm1           \n\t"
>> -                "addsd     %%xmm5, %%xmm2           \n\t"
>> -                "movsd     %%xmm0,   (%1)           \n\t"
>> -                "movsd     %%xmm1,  8(%1)           \n\t"
>> -                "movsd     %%xmm2, 16(%1)           \n\t"
>> -                :"+&r"(i)
>> -                :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
>> -                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
>> -                :"memory"
>> -            );
>> -        } else {
>> -            __asm__ volatile(
>> -                "movsd    "MANGLE(pd_1)", %%xmm0    \n\t"
>> -                "movsd    "MANGLE(pd_1)", %%xmm1    \n\t"
>> -                "1:                                 \n\t"
>> -                "movapd   (%3,%0), %%xmm3           \n\t"
>> -                "movupd -8(%4,%0), %%xmm4           \n\t"
>> -                "mulpd     %%xmm3, %%xmm4           \n\t"
>> -                "mulpd    (%4,%0), %%xmm3           \n\t"
>> -                "addpd     %%xmm4, %%xmm1           \n\t"
>> -                "addpd     %%xmm3, %%xmm0           \n\t"
>> -                "add       $16,    %0               \n\t"
>> -                "jl 1b                              \n\t"
>> -                "movhlps   %%xmm0, %%xmm3           \n\t"
>> -                "movhlps   %%xmm1, %%xmm4           \n\t"
>> -                "addsd     %%xmm3, %%xmm0           \n\t"
>> -                "addsd     %%xmm4, %%xmm1           \n\t"
>> -                "movsd     %%xmm0, %1               \n\t"
>> -                "movsd     %%xmm1, %2               \n\t"
>> -                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
>> -                :"r"(data+len), "r"(data+len-j)
>> -                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
>> -            );
>> -        }
>> -    }
>> -}
>> -
>> -#endif /* HAVE_SSE2_INLINE */
>> +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, 
>> int lag,
>> +                                  double *autoc);
>> +void ff_lpc_compute_autocorr_avx(const double *data, ptrdiff_t len, 
>> int lag,
>> +                                 double *autoc);
>>   av_cold void ff_lpc_init_x86(LPCContext *c)
>>   {
>>       int cpu_flags = av_get_cpu_flags();
>> -#if HAVE_SSE2_INLINE
>> -    if (INLINE_SSE2_SLOW(cpu_flags))
>> -        c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
>> -#endif
>> +    if (EXTERNAL_SSE2(cpu_flags))
>> +        c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
> 
> Place this with ff_lpc_apply_welch_window_sse2 below.
> 
>> +
>> +    if (EXTERNAL_AVX_FAST(cpu_flags))
>> +        c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx;
>>       if (EXTERNAL_SSE2(cpu_flags))
>>           c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
Michael Niedermayer May 26, 2024, 7:43 p.m. UTC | #3
On Sun, May 26, 2024 at 03:42:01AM +0200, Lynne via ffmpeg-devel wrote:
> The inline asm function had issues running under checkasm.
> So I came to finish what I started, and wrote the last part
> of LPC computation in assembly.
> ---
>  libavcodec/x86/lpc.asm    | 91 +++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
>  2 files changed, 100 insertions(+), 78 deletions(-)

seems to break fate
make: *** [tests/Makefile:311: fate-lavf-ogg] Error 1
make: *** [tests/Makefile:311: fate-iamf-stereo] Error 1
make: *** [tests/Makefile:311: fate-mov-mp4-iamf-stereo] Error 1
make: *** [tests/Makefile:311: fate-iamf-ambisonic_1] Error 1
make: *** [tests/Makefile:310: fate-mov-mp4-iamf-ambisonic_1] Error 1
make: *** [tests/Makefile:311: fate-mov-mp4-iamf-5_1_4] Error 1
make: *** [tests/Makefile:311: fate-iamf-5_1_4] Error 1
make: *** [tests/Makefile:311: fate-iamf-7_1_4] Error 1
make: *** [tests/Makefile:311: fate-mov-mp4-iamf-7_1_4] Error 1
make: *** [tests/Makefile:311: fate-cover-art-flac-remux] Error 1

thx

[...]
diff mbox series

Patch

diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
index a585c17ef5..9c359ae480 100644
--- a/libavcodec/x86/lpc.asm
+++ b/libavcodec/x86/lpc.asm
@@ -261,3 +261,94 @@  APPLY_WELCH_FN
 INIT_YMM avx2
 APPLY_WELCH_FN
 %endif
+
+%macro COMPUTE_AUTOCORR_FN 0
+cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p, data_l, len_p
+    shl lagd, 3
+    shl lenq, 3
+    xor lag_pq, lag_pq
+
+.lag_l:
+    movaps m2, [one_tab]
+
+    mov len_pq, lag_pq
+
+    lea data_lq, [lag_pq + mmsize - 8]
+    neg data_lq                     ; -j - mmsize
+    add data_lq, dataq              ; data[-j - mmsize]
+.len_l:
+
+%if mmsize == 32
+    vbroadcastsd m0, [dataq + len_pq]
+    vpermpd m1, [data_lq + len_pq], q0123
+%else
+    movupd m1, [data_lq + len_pq]   ; data[i - j]
+    movsd xm0, [dataq + len_pq]     ; data[i]
+    shufpd m1, m1, m1, 01b
+%endif
+
+    shufpd m0, m0, m0, 1100b
+
+    ; fmadd actually hurts performance in this case due to
+    ; the earlier loads + shuffles
+    mulpd m0, m1
+    addpd m2, m0                    ; sum += data[i]*data[i-j]
+
+    add len_pq, 8
+    cmp len_pq, lenq
+    jl .len_l
+
+    movupd [autocq + lag_pq], m2    ; autoc[j] = sum
+    add lag_pq, mmsize
+    cmp lag_pq, lagq
+    jl .lag_l
+
+    ; The tail computation is guaranteed never to happen
+    ; as long as we're doing multiples of 4, rather than 2.
+%if mmsize != 32
+    jg .end
+    ; If lag_p == lag fallthrough
+
+.tail:
+    movaps m2, [one_tab]
+
+    mov len_pq, lag_pq
+    sub len_pq, mmsize
+
+    lea data_lq, [lag_pq]
+    neg data_lq                     ; -j
+    add data_lq, dataq              ; data[-j]
+
+.tail_l:
+    movupd m0, [dataq + len_pq]
+    movupd m1, [data_lq + len_pq]
+
+    mulpd m0, m1
+    addpd m2, m0                    ; sum += data[i]*data[i-j]
+
+    add len_pq, mmsize
+    cmp len_pq, lenq
+    jl .tail_l
+
+    shufpd m1, m2, m2, 01b
+    addpd m2, m1
+
+    ; Leave this here just in case its ever needed
+%if mmsize == 32
+    vperm2f128 m1, m2, m2, 0x01
+    addpd xm2, xm1
+    movupd [autocq + lag_pq], xm2
+%else
+    movhpd [autocq + lag_pq], xm2
+%endif
+
+.end:
+%endif
+
+    RET
+%endmacro
+
+INIT_XMM sse2
+COMPUTE_AUTOCORR_FN
+INIT_YMM avx
+COMPUTE_AUTOCORR_FN
diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
index f2fca53799..bb174be53e 100644
--- a/libavcodec/x86/lpc_init.c
+++ b/libavcodec/x86/lpc_init.c
@@ -28,89 +28,20 @@  void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len,
                                     double *w_data);
 void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
                                     double *w_data);
-
-DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
-
-#if HAVE_SSE2_INLINE
-
-static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
-                                      double *autoc)
-{
-    int j;
-
-    if((x86_reg)data & 15)
-        data++;
-
-    for(j=0; j<lag; j+=2){
-        x86_reg i = -len*sizeof(double);
-        if(j == lag-2) {
-            __asm__ volatile(
-                "movsd    "MANGLE(pd_1)", %%xmm0    \n\t"
-                "movsd    "MANGLE(pd_1)", %%xmm1    \n\t"
-                "movsd    "MANGLE(pd_1)", %%xmm2    \n\t"
-                "1:                                 \n\t"
-                "movapd   (%2,%0), %%xmm3           \n\t"
-                "movupd -8(%3,%0), %%xmm4           \n\t"
-                "movapd   (%3,%0), %%xmm5           \n\t"
-                "mulpd     %%xmm3, %%xmm4           \n\t"
-                "mulpd     %%xmm3, %%xmm5           \n\t"
-                "mulpd -16(%3,%0), %%xmm3           \n\t"
-                "addpd     %%xmm4, %%xmm1           \n\t"
-                "addpd     %%xmm5, %%xmm0           \n\t"
-                "addpd     %%xmm3, %%xmm2           \n\t"
-                "add       $16,    %0               \n\t"
-                "jl 1b                              \n\t"
-                "movhlps   %%xmm0, %%xmm3           \n\t"
-                "movhlps   %%xmm1, %%xmm4           \n\t"
-                "movhlps   %%xmm2, %%xmm5           \n\t"
-                "addsd     %%xmm3, %%xmm0           \n\t"
-                "addsd     %%xmm4, %%xmm1           \n\t"
-                "addsd     %%xmm5, %%xmm2           \n\t"
-                "movsd     %%xmm0,   (%1)           \n\t"
-                "movsd     %%xmm1,  8(%1)           \n\t"
-                "movsd     %%xmm2, 16(%1)           \n\t"
-                :"+&r"(i)
-                :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
-                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
-                :"memory"
-            );
-        } else {
-            __asm__ volatile(
-                "movsd    "MANGLE(pd_1)", %%xmm0    \n\t"
-                "movsd    "MANGLE(pd_1)", %%xmm1    \n\t"
-                "1:                                 \n\t"
-                "movapd   (%3,%0), %%xmm3           \n\t"
-                "movupd -8(%4,%0), %%xmm4           \n\t"
-                "mulpd     %%xmm3, %%xmm4           \n\t"
-                "mulpd    (%4,%0), %%xmm3           \n\t"
-                "addpd     %%xmm4, %%xmm1           \n\t"
-                "addpd     %%xmm3, %%xmm0           \n\t"
-                "add       $16,    %0               \n\t"
-                "jl 1b                              \n\t"
-                "movhlps   %%xmm0, %%xmm3           \n\t"
-                "movhlps   %%xmm1, %%xmm4           \n\t"
-                "addsd     %%xmm3, %%xmm0           \n\t"
-                "addsd     %%xmm4, %%xmm1           \n\t"
-                "movsd     %%xmm0, %1               \n\t"
-                "movsd     %%xmm1, %2               \n\t"
-                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
-                :"r"(data+len), "r"(data+len-j)
-                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
-            );
-        }
-    }
-}
-
-#endif /* HAVE_SSE2_INLINE */
+void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
+                                  double *autoc);
+void ff_lpc_compute_autocorr_avx(const double *data, ptrdiff_t len, int lag,
+                                 double *autoc);
 
 av_cold void ff_lpc_init_x86(LPCContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if HAVE_SSE2_INLINE
-    if (INLINE_SSE2_SLOW(cpu_flags))
-        c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
-#endif
+    if (EXTERNAL_SSE2(cpu_flags))
+        c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
+
+    if (EXTERNAL_AVX_FAST(cpu_flags))
+        c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx;
 
     if (EXTERNAL_SSE2(cpu_flags))
         c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;