
[FFmpeg-devel,7/8] lavc/flacenc: add AVX2 version of the 32-bit LPC encoder

Message ID 20171126225111.5108-8-james.darnley@gmail.com
State New

Commit Message

James Darnley Nov. 26, 2017, 10:51 p.m. UTC
When compared to the SSE4.2 version, runtime is reduced by 1 to 26%.  The
function itself is around 2 times faster.
---
 libavcodec/x86/flac_dsp_gpl.asm | 56 +++++++++++++++++++++++++++++++----------
 libavcodec/x86/flacdsp_init.c   |  5 +++-
 2 files changed, 47 insertions(+), 14 deletions(-)

Comments

Rostislav Pehlivanov Nov. 26, 2017, 11:13 p.m. UTC | #1
On 26 November 2017 at 22:51, James Darnley <james.darnley@gmail.com> wrote:

> When compared to the SSE4.2 version, runtime is reduced by 1 to 26%.  The
> function itself is around 2 times faster.
> ---
>  libavcodec/x86/flac_dsp_gpl.asm | 56 +++++++++++++++++++++++++++++++----------
>  libavcodec/x86/flacdsp_init.c   |  5 +++-
>  2 files changed, 47 insertions(+), 14 deletions(-)
>
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> index 91989ce560..749e66dec8 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -22,11 +22,11 @@
>
>  %include "libavutil/x86/x86util.asm"
>
> -SECTION_RODATA
> +SECTION_RODATA 32
>
> -pd_0_int_min: times  2 dd 0, -2147483648
> -pq_int_min:   times  2 dq -2147483648
> -pq_int_max:   times  2 dq  2147483647
> +pd_0_int_min: times  4 dd 0, -2147483648
> +pq_int_min:   times  4 dq -2147483648
> +pq_int_max:   times  4 dq  2147483647
>
>  SECTION .text
>
> @@ -123,7 +123,10 @@ RET
>  %endmacro
>
>  %macro PMINSQ 3
> -    pcmpgtq %3, %2, %1
> +    mova    %3, %2
> +    ; We cannot use the 3-operand format because the memory location cannot be
> +    ; the second operand, only the third.
> +    pcmpgtq %3, %1
>

I don't get it, how did it work before then?


>      pand    %1, %3
>      pandn   %3, %2
>      por     %1, %3
> @@ -177,11 +180,11 @@ lea    resq,   [resq+orderq*4]
>  lea    smpq,   [smpq+orderq*4]
>  lea    coefsq, [coefsq+orderq*4]
>  sub    length,  orderd
> -movd   m3,      r5m
> +movd   xm3,     r5m
>  neg    orderq
>
>  movu   m4,     [pd_0_int_min] ; load 1 bit
> -psrad  m4,      m3            ; turn that into shift+1 bits
> +psrad  m4,      xm3           ; turn that into shift+1 bits
>  pslld  m4,      1             ; reduce that
>  mova  [rsp],    m4            ; save sign extend mask
>
> @@ -197,8 +200,20 @@ mova  [rsp],    m4            ; save sign extend mask
>      xor  negj, negj
>
>      .looporder1:
> +%if cpuflag(avx)
> +        vbroadcastss m2, [coefsq+posj*4]
> +%else
>          movd   m2,  [coefsq+posj*4] ; c = coefs[j]
>          SPLATD m2
> +%endif
> +%if cpuflag(avx)
> +        vpmuldq  m1, m2, [smpq+negj*4-4]
> +        vpmuldq  m5, m2, [smpq+negj*4-4+mmsize]
> +        vpmuldq  m7, m2, [smpq+negj*4-4+mmsize*2]
> +        vpaddq   m0, m1
> +        vpaddq   m4, m5
> +        vpaddq   m6, m7
>

Why force VEX encoding for these instructions, on avx no less?


> +%else
>          movu   m1,  [smpq+negj*4-4] ; s = smp[i-j-1]
>          movu   m5,  [smpq+negj*4-4+mmsize]
>          movu   m7,  [smpq+negj*4-4+mmsize*2]
> @@ -212,14 +227,15 @@ mova  [rsp],    m4            ; save sign extend mask
>          paddq  m0,   m1             ; p += c * s
>          paddq  m4,   m5
>          paddq  m6,   m7
> +%endif
>
>          dec    negj
>          inc    posj
>      jnz .looporder1
>
> -    HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
> -    HACK_PSRAQ m4, m3, [rsp], m2
> -    HACK_PSRAQ m6, m3, [rsp], m2
> +    HACK_PSRAQ m0, xm3, [rsp], m2    ; p >>= shift
> +    HACK_PSRAQ m4, xm3, [rsp], m2
> +    HACK_PSRAQ m6, xm3, [rsp], m2
>      CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
>      CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
>      CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
> @@ -241,8 +257,20 @@ mova  [rsp],    m4            ; save sign extend mask
>      xor  negj, negj
>
>      .looporder2:
> +%if cpuflag(avx)
> +        vbroadcastss m2, [coefsq+posj*4]
> +%else
>          movd   m2,  [coefsq+posj*4] ; c = coefs[j]
>          SPLATD m2
> +%endif
> +%if cpuflag(avx)
> +        vpmuldq  m1, m2, [smpq+negj*4]
> +        vpmuldq  m5, m2, [smpq+negj*4+mmsize]
> +        vpmuldq  m7, m2, [smpq+negj*4+mmsize*2]
> +        vpaddq   m0, m1
> +        vpaddq   m4, m5
> +        vpaddq   m6, m7
> +%else
>          movu   m1,  [smpq+negj*4] ; s = smp[i-j-1]
>          movu   m5,  [smpq+negj*4+mmsize]
>          movu   m7,  [smpq+negj*4+mmsize*2]
> @@ -252,14 +280,15 @@ mova  [rsp],    m4            ; save sign extend mask
>          paddq  m0,   m1             ; p += c * s
>          paddq  m4,   m5
>          paddq  m6,   m7
> +%endif
>
>          dec    negj
>          inc    posj
>      jnz .looporder2
>
> -    HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
> -    HACK_PSRAQ m4, m3, [rsp], m2
> -    HACK_PSRAQ m6, m3, [rsp], m2
> +    HACK_PSRAQ m0, xm3, [rsp], m2    ; p >>= shift
> +    HACK_PSRAQ m4, xm3, [rsp], m2
> +    HACK_PSRAQ m6, xm3, [rsp], m2
>      CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
>      CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
>      CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
> @@ -300,3 +329,4 @@ FUNCTION_BODY_32
>
>  INIT_YMM avx2
>  FUNCTION_BODY_16
> +FUNCTION_BODY_32
> diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
> index f827186c26..fbe70894a0 100644
> --- a/libavcodec/x86/flacdsp_init.c
> +++ b/libavcodec/x86/flacdsp_init.c
> @@ -30,6 +30,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
>  void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
>  void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
>  void ff_flac_enc_lpc_32_sse42(int32_t *, const int32_t *, int, int, const int32_t *,int);
> +void ff_flac_enc_lpc_32_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
>
>  #define DECORRELATE_FUNCS(fmt, opt)                                                      \
>  void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
> @@ -117,8 +118,10 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
>              c->lpc32_encode = ff_flac_enc_lpc_32_sse42;
>      }
>      if (EXTERNAL_AVX2(cpu_flags)) {
> -        if (CONFIG_GPL)
> +        if (CONFIG_GPL) {
>              c->lpc16_encode = ff_flac_enc_lpc_16_avx2;
> +            c->lpc32_encode = ff_flac_enc_lpc_32_avx2;
> +        }
>      }
>  #endif
>  #endif /* HAVE_X86ASM */
> --
> 2.15.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
James Darnley Nov. 26, 2017, 11:26 p.m. UTC | #2
On 2017-11-27 00:13, Rostislav Pehlivanov wrote:
> On 26 November 2017 at 22:51, James Darnley <james.darnley@gmail.com> wrote:
>> @@ -123,7 +123,10 @@ RET
>>  %endmacro
>>
>>  %macro PMINSQ 3
>> -    pcmpgtq %3, %2, %1
>> +    mova    %3, %2
>> +    ; We cannot use the 3-operand format because the memory location cannot be
>> +    ; the second operand, only the third.
>> +    pcmpgtq %3, %1
>>
> 
> I don't get it, how did it work before then?

Easy.  3-operand instructions were never generated using it, meaning it
was always emulated with a move.
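
For reference, a rough intrinsics sketch of the select it boils down to
(a signed 64-bit minimum built from pcmpgtq plus and/andn/por, since
SSE4.2 has no pminsq; illustrative only, not the patch's code):

#include <immintrin.h>

/* min(a, b) over signed 64-bit lanes, SSE4.2 */
static __m128i pminsq_sse42(__m128i a, __m128i b)
{
    __m128i gt = _mm_cmpgt_epi64(b, a);           /* b > a ? ~0 : 0     */
    return _mm_or_si128(_mm_and_si128(gt, a),     /* take a where b > a */
                        _mm_andnot_si128(gt, b)); /* take b otherwise   */
}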

>> @@ -197,8 +200,20 @@ mova  [rsp],    m4            ; save sign extend mask
>>      xor  negj, negj
>>
>>      .looporder1:
>> +%if cpuflag(avx)
>> +        vbroadcastss m2, [coefsq+posj*4]
>> +%else
>>          movd   m2,  [coefsq+posj*4] ; c = coefs[j]
>>          SPLATD m2
>> +%endif
>> +%if cpuflag(avx)
>> +        vpmuldq  m1, m2, [smpq+negj*4-4]
>> +        vpmuldq  m5, m2, [smpq+negj*4-4+mmsize]
>> +        vpmuldq  m7, m2, [smpq+negj*4-4+mmsize*2]
>> +        vpaddq   m0, m1
>> +        vpaddq   m4, m5
>> +        vpaddq   m6, m7
>>
> 
> Why force VEX encoding for these instructions, on avx no less?

Not sure.  Legacy code written before I knew what I was doing?  Perhaps
some issue arose with the assembler or x86inc at that time and this is
how I worked around it.
James Almer Nov. 26, 2017, 11:29 p.m. UTC | #3
On 11/26/2017 7:51 PM, James Darnley wrote:
> When compared to the SSE4.2 version, runtime is reduced by 1 to 26%.  The
> function itself is around 2 times faster.
> ---
>  libavcodec/x86/flac_dsp_gpl.asm | 56 +++++++++++++++++++++++++++++++----------
>  libavcodec/x86/flacdsp_init.c   |  5 +++-
>  2 files changed, 47 insertions(+), 14 deletions(-)
> 
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> index 91989ce560..749e66dec8 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -22,11 +22,11 @@
>  
>  %include "libavutil/x86/x86util.asm"
>  
> -SECTION_RODATA
> +SECTION_RODATA 32
>  
> -pd_0_int_min: times  2 dd 0, -2147483648
> -pq_int_min:   times  2 dq -2147483648
> -pq_int_max:   times  2 dq  2147483647
> +pd_0_int_min: times  4 dd 0, -2147483648
> +pq_int_min:   times  4 dq -2147483648
> +pq_int_max:   times  4 dq  2147483647
>  
>  SECTION .text
>  
> @@ -123,7 +123,10 @@ RET
>  %endmacro
>  
>  %macro PMINSQ 3
> -    pcmpgtq %3, %2, %1
> +    mova    %3, %2
> +    ; We cannot use the 3-operand format because the memory location cannot be
> +    ; the second operand, only the third.
> +    pcmpgtq %3, %1
>      pand    %1, %3
>      pandn   %3, %2
>      por     %1, %3
> @@ -177,11 +180,11 @@ lea    resq,   [resq+orderq*4]
>  lea    smpq,   [smpq+orderq*4]
>  lea    coefsq, [coefsq+orderq*4]
>  sub    length,  orderd
> -movd   m3,      r5m
> +movd   xm3,     r5m
>  neg    orderq
>  
>  movu   m4,     [pd_0_int_min] ; load 1 bit
> -psrad  m4,      m3            ; turn that into shift+1 bits
> +psrad  m4,      xm3           ; turn that into shift+1 bits
>  pslld  m4,      1             ; reduce that
>  mova  [rsp],    m4            ; save sign extend mask
>  
> @@ -197,8 +200,20 @@ mova  [rsp],    m4            ; save sign extend mask
>      xor  negj, negj
>  
>      .looporder1:
> +%if cpuflag(avx)

Either avx2, or check instead for mmsize == 32

> +        vbroadcastss m2, [coefsq+posj*4]

vpbroadcastd. Or just use the VPBROADCASTD macro to cover both the avx2
and sse4 cases without ifdeffery.
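
In intrinsics terms the integer-domain broadcast it maps to is roughly
this (illustrative sketch only, not part of the patch):

#include <immintrin.h>
#include <stdint.h>

/* with AVX2 compilers fold this into a single vpbroadcastd from memory */
static __m256i broadcast_coef(const int32_t *coefs, int j)
{
    return _mm256_set1_epi32(coefs[j]);
}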

> +%else
>          movd   m2,  [coefsq+posj*4] ; c = coefs[j]
>          SPLATD m2
> +%endif
> +%if cpuflag(avx)
> +        vpmuldq  m1, m2, [smpq+negj*4-4]
> +        vpmuldq  m5, m2, [smpq+negj*4-4+mmsize]
> +        vpmuldq  m7, m2, [smpq+negj*4-4+mmsize*2]
> +        vpaddq   m0, m1
> +        vpaddq   m4, m5
> +        vpaddq   m6, m7
> +%else
>          movu   m1,  [smpq+negj*4-4] ; s = smp[i-j-1]
>          movu   m5,  [smpq+negj*4-4+mmsize]
>          movu   m7,  [smpq+negj*4-4+mmsize*2]
> @@ -212,14 +227,15 @@ mova  [rsp],    m4            ; save sign extend mask
>          paddq  m0,   m1             ; p += c * s
>          paddq  m4,   m5
>          paddq  m6,   m7
> +%endif
>  
>          dec    negj
>          inc    posj
>      jnz .looporder1
>  
> -    HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
> -    HACK_PSRAQ m4, m3, [rsp], m2
> -    HACK_PSRAQ m6, m3, [rsp], m2
> +    HACK_PSRAQ m0, xm3, [rsp], m2    ; p >>= shift
> +    HACK_PSRAQ m4, xm3, [rsp], m2
> +    HACK_PSRAQ m6, xm3, [rsp], m2
>      CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
>      CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
>      CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
> @@ -241,8 +257,20 @@ mova  [rsp],    m4            ; save sign extend mask
>      xor  negj, negj
>  
>      .looporder2:
> +%if cpuflag(avx)
> +        vbroadcastss m2, [coefsq+posj*4]

Same

> +%else
>          movd   m2,  [coefsq+posj*4] ; c = coefs[j]
>          SPLATD m2
> +%endif
> +%if cpuflag(avx)
> +        vpmuldq  m1, m2, [smpq+negj*4]
> +        vpmuldq  m5, m2, [smpq+negj*4+mmsize]
> +        vpmuldq  m7, m2, [smpq+negj*4+mmsize*2]
> +        vpaddq   m0, m1
> +        vpaddq   m4, m5
> +        vpaddq   m6, m7
> +%else
>          movu   m1,  [smpq+negj*4] ; s = smp[i-j-1]
>          movu   m5,  [smpq+negj*4+mmsize]
>          movu   m7,  [smpq+negj*4+mmsize*2]
> @@ -252,14 +280,15 @@ mova  [rsp],    m4            ; save sign extend mask
>          paddq  m0,   m1             ; p += c * s
>          paddq  m4,   m5
>          paddq  m6,   m7
> +%endif
>  
>          dec    negj
>          inc    posj
>      jnz .looporder2
>  
> -    HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
> -    HACK_PSRAQ m4, m3, [rsp], m2
> -    HACK_PSRAQ m6, m3, [rsp], m2
> +    HACK_PSRAQ m0, xm3, [rsp], m2    ; p >>= shift
> +    HACK_PSRAQ m4, xm3, [rsp], m2
> +    HACK_PSRAQ m6, xm3, [rsp], m2
>      CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
>      CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
>      CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
> @@ -300,3 +329,4 @@ FUNCTION_BODY_32
>  
>  INIT_YMM avx2
>  FUNCTION_BODY_16
> +FUNCTION_BODY_32
> diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
> index f827186c26..fbe70894a0 100644
> --- a/libavcodec/x86/flacdsp_init.c
> +++ b/libavcodec/x86/flacdsp_init.c
> @@ -30,6 +30,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
>  void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
>  void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
>  void ff_flac_enc_lpc_32_sse42(int32_t *, const int32_t *, int, int, const int32_t *,int);
> +void ff_flac_enc_lpc_32_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
>  
>  #define DECORRELATE_FUNCS(fmt, opt)                                                      \
>  void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
> @@ -117,8 +118,10 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
>              c->lpc32_encode = ff_flac_enc_lpc_32_sse42;
>      }
>      if (EXTERNAL_AVX2(cpu_flags)) {
> -        if (CONFIG_GPL)
> +        if (CONFIG_GPL) {
>              c->lpc16_encode = ff_flac_enc_lpc_16_avx2;
> +            c->lpc32_encode = ff_flac_enc_lpc_32_avx2;
> +        }
>      }
>  #endif
>  #endif /* HAVE_X86ASM */
>
James Almer Nov. 26, 2017, 11:42 p.m. UTC | #4
On 11/26/2017 8:13 PM, Rostislav Pehlivanov wrote:
> On 26 November 2017 at 22:51, James Darnley <james.darnley@gmail.com> wrote:
> 
>> When compared to the SSE4.2 version, runtime is reduced by 1 to 26%.  The
>> function itself is around 2 times faster.
>> ---
>>  libavcodec/x86/flac_dsp_gpl.asm | 56 +++++++++++++++++++++++++++++++----------
>>  libavcodec/x86/flacdsp_init.c   |  5 +++-
>>  2 files changed, 47 insertions(+), 14 deletions(-)
>>
>> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
>> index 91989ce560..749e66dec8 100644
>> --- a/libavcodec/x86/flac_dsp_gpl.asm
>> +++ b/libavcodec/x86/flac_dsp_gpl.asm
>> @@ -22,11 +22,11 @@
>>
>>  %include "libavutil/x86/x86util.asm"
>>
>> -SECTION_RODATA
>> +SECTION_RODATA 32
>>
>> -pd_0_int_min: times  2 dd 0, -2147483648
>> -pq_int_min:   times  2 dq -2147483648
>> -pq_int_max:   times  2 dq  2147483647
>> +pd_0_int_min: times  4 dd 0, -2147483648
>> +pq_int_min:   times  4 dq -2147483648
>> +pq_int_max:   times  4 dq  2147483647
>>
>>  SECTION .text
>>
>> @@ -123,7 +123,10 @@ RET
>>  %endmacro
>>
>>  %macro PMINSQ 3
>> -    pcmpgtq %3, %2, %1
>> +    mova    %3, %2
>> +    ; We cannot use the 3-operand format because the memory location cannot be
>> +    ; the second operand, only the third.
>> +    pcmpgtq %3, %1
>>
> 
> I don't get it, how did it work before then?
> 
> 
>>      pand    %1, %3
>>      pandn   %3, %2
>>      por     %1, %3
>> @@ -177,11 +180,11 @@ lea    resq,   [resq+orderq*4]
>>  lea    smpq,   [smpq+orderq*4]
>>  lea    coefsq, [coefsq+orderq*4]
>>  sub    length,  orderd
>> -movd   m3,      r5m
>> +movd   xm3,     r5m
>>  neg    orderq
>>
>>  movu   m4,     [pd_0_int_min] ; load 1 bit
>> -psrad  m4,      m3            ; turn that into shift+1 bits
>> +psrad  m4,      xm3           ; turn that into shift+1 bits
>>  pslld  m4,      1             ; reduce that
>>  mova  [rsp],    m4            ; save sign extend mask
>>
>> @@ -197,8 +200,20 @@ mova  [rsp],    m4            ; save sign extend mask
>>      xor  negj, negj
>>
>>      .looporder1:
>> +%if cpuflag(avx)
>> +        vbroadcastss m2, [coefsq+posj*4]
>> +%else
>>          movd   m2,  [coefsq+posj*4] ; c = coefs[j]
>>          SPLATD m2
>> +%endif
>> +%if cpuflag(avx)
>> +        vpmuldq  m1, m2, [smpq+negj*4-4]
>> +        vpmuldq  m5, m2, [smpq+negj*4-4+mmsize]
>> +        vpmuldq  m7, m2, [smpq+negj*4-4+mmsize*2]
>> +        vpaddq   m0, m1
>> +        vpaddq   m4, m5
>> +        vpaddq   m6, m7
>>
> 
> Why force VEX encoding for these instructions, on avx no less?

It's avx2 and using ymm regs, not avx.
Henrik Gramner Nov. 27, 2017, 4:50 p.m. UTC | #5
On Sun, Nov 26, 2017 at 11:51 PM, James Darnley <james.darnley@gmail.com> wrote:
> -pd_0_int_min: times  2 dd 0, -2147483648
> -pq_int_min:   times  2 dq -2147483648
> -pq_int_max:   times  2 dq  2147483647
> +pd_0_int_min: times  4 dd 0, -2147483648
> +pq_int_min:   times  4 dq -2147483648
> +pq_int_max:   times  4 dq  2147483647

Using 128-bit broadcasts is preferable to duplicating the constants to
256-bit unless there's a good reason for the duplication, since
broadcasts waste less cache and are faster on AMD CPUs.
James Almer Nov. 27, 2017, 5 p.m. UTC | #6
On 11/27/2017 1:50 PM, Henrik Gramner wrote:
> On Sun, Nov 26, 2017 at 11:51 PM, James Darnley <james.darnley@gmail.com> wrote:
>> -pd_0_int_min: times  2 dd 0, -2147483648
>> -pq_int_min:   times  2 dq -2147483648
>> -pq_int_max:   times  2 dq  2147483647
>> +pd_0_int_min: times  4 dd 0, -2147483648
>> +pq_int_min:   times  4 dq -2147483648
>> +pq_int_max:   times  4 dq  2147483647
> 
> Using 128-bit broadcasts is preferable to duplicating the constants to
> 256-bit unless there's a good reason for the duplication, since
> broadcasts waste less cache and are faster on AMD CPUs.

What would that reason be? Afaik broadcasts are expensive, since they
both load from memory and then splat the data across lanes. Using them
inside loops doesn't sound like a good idea. But I guess you have more
experience testing with more varied chips than I do.

Also, by AMD CPUs you mean Ryzen? Because on Bulldozer-based CPUs we
purposely disabled functions using ymm regs.
Henrik Gramner Nov. 27, 2017, 5:12 p.m. UTC | #7
>> Using 128-bit broadcasts is preferable to duplicating the constants to
>> 256-bit unless there's a good reason for the duplication, since
>> broadcasts waste less cache and are faster on AMD CPUs.
>
> What would that reason be? Afaik broadcasts are expensive, since they
> both load from memory and then splat the data across lanes. Using them
> inside loops doesn't sound like a good idea. But I guess you have more
> experience testing with more varied chips than I do.

128-bit broadcasts from memory are done in the load unit for free on
all AVX2-capable CPUs.

> Also, by AMD CPUs you mean Ryzen? Because on Bulldozer-based CPUs we
> purposely disabled functions using ymm regs.

Yes. 128-bit broadcasts have twice the throughput of 256-bit loads on
Ryzen, since it only has 128-bit load units.
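
A minimal sketch of the pattern in intrinsics (names illustrative, not
the patch's code): keep the constant 128 bits wide in .rodata and
broadcast it at load time.

#include <immintrin.h>

static const long long pq_int_min[2] = { -2147483648LL, -2147483648LL };

/* splat the low 128 bits across both lanes; with AVX2 this folds into
 * a single vbroadcasti128 load from memory */
static __m256i load_pq_int_min(void)
{
    return _mm256_broadcastsi128_si256(
               _mm_loadu_si128((const __m128i *)pq_int_min));
}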
James Darnley Dec. 2, 2017, 11:50 a.m. UTC | #8
On 2017-11-27 17:50, Henrik Gramner wrote:
> On Sun, Nov 26, 2017 at 11:51 PM, James Darnley <james.darnley@gmail.com> wrote:
>> -pd_0_int_min: times  2 dd 0, -2147483648
>> -pq_int_min:   times  2 dq -2147483648
>> -pq_int_max:   times  2 dq  2147483647
>> +pd_0_int_min: times  4 dd 0, -2147483648
>> +pq_int_min:   times  4 dq -2147483648
>> +pq_int_max:   times  4 dq  2147483647
> 
> Using 128-bit broadcasts is preferable to duplicating the constants to
> 256-bit unless there's a good reason for the duplication, since
> broadcasts waste less cache and are faster on AMD CPUs.

At first I thought it sounded like a possible candidate for x86-64
optimisation; I have run out of registers on x86.  That said, the
register pressure is in the inner loop, and these constants are used in
the outer loop or just once, so I have some room.

Do you want to block the patch set while I change this or could it be
left for another time?

Thanks for the suggestion anyway.

Patch

diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 91989ce560..749e66dec8 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -22,11 +22,11 @@ 
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
-pd_0_int_min: times  2 dd 0, -2147483648
-pq_int_min:   times  2 dq -2147483648
-pq_int_max:   times  2 dq  2147483647
+pd_0_int_min: times  4 dd 0, -2147483648
+pq_int_min:   times  4 dq -2147483648
+pq_int_max:   times  4 dq  2147483647
 
 SECTION .text
 
@@ -123,7 +123,10 @@  RET
 %endmacro
 
 %macro PMINSQ 3
-    pcmpgtq %3, %2, %1
+    mova    %3, %2
+    ; We cannot use the 3-operand format because the memory location cannot be
+    ; the second operand, only the third.
+    pcmpgtq %3, %1
     pand    %1, %3
     pandn   %3, %2
     por     %1, %3
@@ -177,11 +180,11 @@  lea    resq,   [resq+orderq*4]
 lea    smpq,   [smpq+orderq*4]
 lea    coefsq, [coefsq+orderq*4]
 sub    length,  orderd
-movd   m3,      r5m
+movd   xm3,     r5m
 neg    orderq
 
 movu   m4,     [pd_0_int_min] ; load 1 bit
-psrad  m4,      m3            ; turn that into shift+1 bits
+psrad  m4,      xm3           ; turn that into shift+1 bits
 pslld  m4,      1             ; reduce that
 mova  [rsp],    m4            ; save sign extend mask
 
@@ -197,8 +200,20 @@  mova  [rsp],    m4            ; save sign extend mask
     xor  negj, negj
 
     .looporder1:
+%if cpuflag(avx)
+        vbroadcastss m2, [coefsq+posj*4]
+%else
         movd   m2,  [coefsq+posj*4] ; c = coefs[j]
         SPLATD m2
+%endif
+%if cpuflag(avx)
+        vpmuldq  m1, m2, [smpq+negj*4-4]
+        vpmuldq  m5, m2, [smpq+negj*4-4+mmsize]
+        vpmuldq  m7, m2, [smpq+negj*4-4+mmsize*2]
+        vpaddq   m0, m1
+        vpaddq   m4, m5
+        vpaddq   m6, m7
+%else
         movu   m1,  [smpq+negj*4-4] ; s = smp[i-j-1]
         movu   m5,  [smpq+negj*4-4+mmsize]
         movu   m7,  [smpq+negj*4-4+mmsize*2]
@@ -212,14 +227,15 @@  mova  [rsp],    m4            ; save sign extend mask
         paddq  m0,   m1             ; p += c * s
         paddq  m4,   m5
         paddq  m6,   m7
+%endif
 
         dec    negj
         inc    posj
     jnz .looporder1
 
-    HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
-    HACK_PSRAQ m4, m3, [rsp], m2
-    HACK_PSRAQ m6, m3, [rsp], m2
+    HACK_PSRAQ m0, xm3, [rsp], m2    ; p >>= shift
+    HACK_PSRAQ m4, xm3, [rsp], m2
+    HACK_PSRAQ m6, xm3, [rsp], m2
     CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
     CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
     CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
@@ -241,8 +257,20 @@  mova  [rsp],    m4            ; save sign extend mask
     xor  negj, negj
 
     .looporder2:
+%if cpuflag(avx)
+        vbroadcastss m2, [coefsq+posj*4]
+%else
         movd   m2,  [coefsq+posj*4] ; c = coefs[j]
         SPLATD m2
+%endif
+%if cpuflag(avx)
+        vpmuldq  m1, m2, [smpq+negj*4]
+        vpmuldq  m5, m2, [smpq+negj*4+mmsize]
+        vpmuldq  m7, m2, [smpq+negj*4+mmsize*2]
+        vpaddq   m0, m1
+        vpaddq   m4, m5
+        vpaddq   m6, m7
+%else
         movu   m1,  [smpq+negj*4] ; s = smp[i-j-1]
         movu   m5,  [smpq+negj*4+mmsize]
         movu   m7,  [smpq+negj*4+mmsize*2]
@@ -252,14 +280,15 @@  mova  [rsp],    m4            ; save sign extend mask
         paddq  m0,   m1             ; p += c * s
         paddq  m4,   m5
         paddq  m6,   m7
+%endif
 
         dec    negj
         inc    posj
     jnz .looporder2
 
-    HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
-    HACK_PSRAQ m4, m3, [rsp], m2
-    HACK_PSRAQ m6, m3, [rsp], m2
+    HACK_PSRAQ m0, xm3, [rsp], m2    ; p >>= shift
+    HACK_PSRAQ m4, xm3, [rsp], m2
+    HACK_PSRAQ m6, xm3, [rsp], m2
     CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
     CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
     CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
@@ -300,3 +329,4 @@  FUNCTION_BODY_32
 
 INIT_YMM avx2
 FUNCTION_BODY_16
+FUNCTION_BODY_32
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index f827186c26..fbe70894a0 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -30,6 +30,7 @@  void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
 void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
 void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
 void ff_flac_enc_lpc_32_sse42(int32_t *, const int32_t *, int, int, const int32_t *,int);
+void ff_flac_enc_lpc_32_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
 
 #define DECORRELATE_FUNCS(fmt, opt)                                                      \
 void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
@@ -117,8 +118,10 @@  av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
             c->lpc32_encode = ff_flac_enc_lpc_32_sse42;
     }
     if (EXTERNAL_AVX2(cpu_flags)) {
-        if (CONFIG_GPL)
+        if (CONFIG_GPL) {
             c->lpc16_encode = ff_flac_enc_lpc_16_avx2;
+            c->lpc32_encode = ff_flac_enc_lpc_32_avx2;
+        }
     }
 #endif
 #endif /* HAVE_X86ASM */