diff mbox series

[FFmpeg-devel] x86/aacencdsp: add SSE2 and AVX versions of quantize_bands

Message ID 20240604012343.1771-1-jamrial@gmail.com
State New
Headers show
Series [FFmpeg-devel] x86/aacencdsp: add SSE2 and AVX versions of quantize_bands | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

James Almer June 4, 2024, 1:23 a.m. UTC
quant_bands_signed_sse2: 417.0
quant_bands_signed_avx: 202.0

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/aacenc.h             |  2 +-
 libavcodec/x86/aacencdsp.asm    | 27 ++++++++++++++++++++++++---
 libavcodec/x86/aacencdsp_init.c |  6 ++++++
 tests/checkasm/aacencdsp.c      |  4 ++--
 4 files changed, 33 insertions(+), 6 deletions(-)

Comments

Andreas Rheinhardt June 4, 2024, 1:42 a.m. UTC | #1
James Almer:
> quant_bands_signed_sse2: 417.0
> quant_bands_signed_avx: 202.0

Missing benchmark numbers for the C code.

> 
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libavcodec/aacenc.h             |  2 +-
>  libavcodec/x86/aacencdsp.asm    | 27 ++++++++++++++++++++++++---
>  libavcodec/x86/aacencdsp_init.c |  6 ++++++
>  tests/checkasm/aacencdsp.c      |  4 ++--
>  4 files changed, 33 insertions(+), 6 deletions(-)
> 
> diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
> index d07960620e..ae15f91e06 100644
> --- a/libavcodec/aacenc.h
> +++ b/libavcodec/aacenc.h
> @@ -242,7 +242,7 @@ typedef struct AACEncContext {
>      enum RawDataBlockType cur_type;              ///< channel group type cur_channel belongs to
>  
>      AudioFrameQueue afq;
> -    DECLARE_ALIGNED(16, int,   qcoefs)[96];      ///< quantized coefficients
> +    DECLARE_ALIGNED(32, int,   qcoefs)[96];      ///< quantized coefficients
>      DECLARE_ALIGNED(32, float, scoefs)[1024];    ///< scaled coefficients
>  
>      uint16_t quantize_band_cost_cache_generation;
> diff --git a/libavcodec/x86/aacencdsp.asm b/libavcodec/x86/aacencdsp.asm
> index 0d3ba4b89d..99be2d87f5 100644
> --- a/libavcodec/x86/aacencdsp.asm
> +++ b/libavcodec/x86/aacencdsp.asm
> @@ -53,8 +53,19 @@ cglobal abs_pow34, 3, 3, 3, out, in, size
>  ;                           int size, int is_signed, int maxval, const float Q34,
>  ;                           const float rounding)
>  ;*******************************************************************
> -INIT_XMM sse2
> +%macro AAC_QUANTIZE_BANDS 0
>  cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
> +%if mmsize == 32
> +    vbroadcastss m0, Q34m
> +    vbroadcastss m1, roundingm
> +%if UNIX64 == 0
> +    cvtsi2ss xm3, dword maxvalm
> +%else
> +    cvtsi2ss xm3, maxvald
> +%endif
> +    shufps   xm3, xm3, xm3, 0
> +    vinsertf128 m3, m3, xm3, 1
> +%else ; mmsize == 16
>  %if UNIX64 == 0
>      movss     m0, Q34m
>      movss     m1, roundingm
> @@ -65,9 +76,13 @@ cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q
>      shufps    m0, m0, 0
>      shufps    m1, m1, 0
>      shufps    m3, m3, 0
> +%endif
>      shl       is_signedd, 31
> -    movd      m4, is_signedd
> -    shufps    m4, m4, 0
> +    movd     xm4, is_signedd
> +    shufps   xm4, xm4, xm4, 0
> +%if mmsize == 32
> +    vinsertf128 m4, m4, xm4, 1
> +%endif
>      shl       sized,   2
>      add       inq, sizeq
>      add       outq, sizeq
> @@ -84,3 +99,9 @@ cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q
>      add       sizeq, mmsize
>      jl       .loop
>      RET
> +%endmacro
> +
> +INIT_XMM sse2
> +AAC_QUANTIZE_BANDS
> +INIT_YMM avx
> +AAC_QUANTIZE_BANDS
> diff --git a/libavcodec/x86/aacencdsp_init.c b/libavcodec/x86/aacencdsp_init.c
> index e0d8dec4f8..cf17dbf91d 100644
> --- a/libavcodec/x86/aacencdsp_init.c
> +++ b/libavcodec/x86/aacencdsp_init.c
> @@ -30,6 +30,9 @@ void ff_abs_pow34_sse(float *out, const float *in, const int size);
>  void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
>                                  int size, int is_signed, int maxval, const float Q34,
>                                  const float rounding);
> +void ff_aac_quantize_bands_avx(int *out, const float *in, const float *scaled,
> +                               int size, int is_signed, int maxval, const float Q34,
> +                               const float rounding);
>  
>  av_cold void ff_aacenc_dsp_init_x86(AACEncDSPContext *s)
>  {
> @@ -40,4 +43,7 @@ av_cold void ff_aacenc_dsp_init_x86(AACEncDSPContext *s)
>  
>      if (EXTERNAL_SSE2(cpu_flags))
>          s->quant_bands = ff_aac_quantize_bands_sse2;

Seems like the commit message is wrong: You are not adding an SSE2 version.

> +
> +    if (EXTERNAL_AVX_FAST(cpu_flags))
> +        s->quant_bands = ff_aac_quantize_bands_avx;
>  }
> diff --git a/tests/checkasm/aacencdsp.c b/tests/checkasm/aacencdsp.c
> index 791dd30320..5308a2ac03 100644
> --- a/tests/checkasm/aacencdsp.c
> +++ b/tests/checkasm/aacencdsp.c
> @@ -81,8 +81,8 @@ static void test_quant_bands(AACEncDSPContext *s)
>      for (int sign = 0; sign <= 1; sign++) {
>          if (check_func(s->quant_bands, "quant_bands_%s",
>                         sign ? "signed" : "unsigned")) {
> -            LOCAL_ALIGNED_16(int, out, [BUF_SIZE]);
> -            LOCAL_ALIGNED_16(int, out2, [BUF_SIZE]);
> +            LOCAL_ALIGNED_32(int, out, [BUF_SIZE]);
> +            LOCAL_ALIGNED_32(int, out2, [BUF_SIZE]);
>  
>              call_ref(out, in, scaled, BUF_SIZE, sign, maxval, q34, rounding);
>              call_new(out2, in, scaled, BUF_SIZE, sign, maxval, q34, rounding);
James Almer June 4, 2024, 1:45 a.m. UTC | #2
On 6/3/2024 10:42 PM, Andreas Rheinhardt wrote:
> James Almer:
>> quant_bands_signed_sse2: 417.0
>> quant_bands_signed_avx: 202.0
> 
> Missing benchmark numbers for the C code

About 1670. It doesn't really matter, though, as I'm only adding the AVX
version (the subject is wrong, copy-paste fail), so I mentioned the SSE2
numbers as a comparison against the existing SIMD version.

But sure, I can add the C one before pushing.

> 
>>
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>>   libavcodec/aacenc.h             |  2 +-
>>   libavcodec/x86/aacencdsp.asm    | 27 ++++++++++++++++++++++++---
>>   libavcodec/x86/aacencdsp_init.c |  6 ++++++
>>   tests/checkasm/aacencdsp.c      |  4 ++--
>>   4 files changed, 33 insertions(+), 6 deletions(-)
>>
>> diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
>> index d07960620e..ae15f91e06 100644
>> --- a/libavcodec/aacenc.h
>> +++ b/libavcodec/aacenc.h
>> @@ -242,7 +242,7 @@ typedef struct AACEncContext {
>>       enum RawDataBlockType cur_type;              ///< channel group type cur_channel belongs to
>>   
>>       AudioFrameQueue afq;
>> -    DECLARE_ALIGNED(16, int,   qcoefs)[96];      ///< quantized coefficients
>> +    DECLARE_ALIGNED(32, int,   qcoefs)[96];      ///< quantized coefficients
>>       DECLARE_ALIGNED(32, float, scoefs)[1024];    ///< scaled coefficients
>>   
>>       uint16_t quantize_band_cost_cache_generation;
>> diff --git a/libavcodec/x86/aacencdsp.asm b/libavcodec/x86/aacencdsp.asm
>> index 0d3ba4b89d..99be2d87f5 100644
>> --- a/libavcodec/x86/aacencdsp.asm
>> +++ b/libavcodec/x86/aacencdsp.asm
>> @@ -53,8 +53,19 @@ cglobal abs_pow34, 3, 3, 3, out, in, size
>>   ;                           int size, int is_signed, int maxval, const float Q34,
>>   ;                           const float rounding)
>>   ;*******************************************************************
>> -INIT_XMM sse2
>> +%macro AAC_QUANTIZE_BANDS 0
>>   cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
>> +%if mmsize == 32
>> +    vbroadcastss m0, Q34m
>> +    vbroadcastss m1, roundingm
>> +%if UNIX64 == 0
>> +    cvtsi2ss xm3, dword maxvalm
>> +%else
>> +    cvtsi2ss xm3, maxvald
>> +%endif
>> +    shufps   xm3, xm3, xm3, 0
>> +    vinsertf128 m3, m3, xm3, 1
>> +%else ; mmsize == 16
>>   %if UNIX64 == 0
>>       movss     m0, Q34m
>>       movss     m1, roundingm
>> @@ -65,9 +76,13 @@ cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q
>>       shufps    m0, m0, 0
>>       shufps    m1, m1, 0
>>       shufps    m3, m3, 0
>> +%endif
>>       shl       is_signedd, 31
>> -    movd      m4, is_signedd
>> -    shufps    m4, m4, 0
>> +    movd     xm4, is_signedd
>> +    shufps   xm4, xm4, xm4, 0
>> +%if mmsize == 32
>> +    vinsertf128 m4, m4, xm4, 1
>> +%endif
>>       shl       sized,   2
>>       add       inq, sizeq
>>       add       outq, sizeq
>> @@ -84,3 +99,9 @@ cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q
>>       add       sizeq, mmsize
>>       jl       .loop
>>       RET
>> +%endmacro
>> +
>> +INIT_XMM sse2
>> +AAC_QUANTIZE_BANDS
>> +INIT_YMM avx
>> +AAC_QUANTIZE_BANDS
>> diff --git a/libavcodec/x86/aacencdsp_init.c b/libavcodec/x86/aacencdsp_init.c
>> index e0d8dec4f8..cf17dbf91d 100644
>> --- a/libavcodec/x86/aacencdsp_init.c
>> +++ b/libavcodec/x86/aacencdsp_init.c
>> @@ -30,6 +30,9 @@ void ff_abs_pow34_sse(float *out, const float *in, const int size);
>>   void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
>>                                   int size, int is_signed, int maxval, const float Q34,
>>                                   const float rounding);
>> +void ff_aac_quantize_bands_avx(int *out, const float *in, const float *scaled,
>> +                               int size, int is_signed, int maxval, const float Q34,
>> +                               const float rounding);
>>   
>>   av_cold void ff_aacenc_dsp_init_x86(AACEncDSPContext *s)
>>   {
>> @@ -40,4 +43,7 @@ av_cold void ff_aacenc_dsp_init_x86(AACEncDSPContext *s)
>>   
>>       if (EXTERNAL_SSE2(cpu_flags))
>>           s->quant_bands = ff_aac_quantize_bands_sse2;
> 
> Seems like the commit message is wrong: You are not adding an SSE2 version.
> 
>> +
>> +    if (EXTERNAL_AVX_FAST(cpu_flags))
>> +        s->quant_bands = ff_aac_quantize_bands_avx;
>>   }
>> diff --git a/tests/checkasm/aacencdsp.c b/tests/checkasm/aacencdsp.c
>> index 791dd30320..5308a2ac03 100644
>> --- a/tests/checkasm/aacencdsp.c
>> +++ b/tests/checkasm/aacencdsp.c
>> @@ -81,8 +81,8 @@ static void test_quant_bands(AACEncDSPContext *s)
>>       for (int sign = 0; sign <= 1; sign++) {
>>           if (check_func(s->quant_bands, "quant_bands_%s",
>>                          sign ? "signed" : "unsigned")) {
>> -            LOCAL_ALIGNED_16(int, out, [BUF_SIZE]);
>> -            LOCAL_ALIGNED_16(int, out2, [BUF_SIZE]);
>> +            LOCAL_ALIGNED_32(int, out, [BUF_SIZE]);
>> +            LOCAL_ALIGNED_32(int, out2, [BUF_SIZE]);
>>   
>>               call_ref(out, in, scaled, BUF_SIZE, sign, maxval, q34, rounding);
>>               call_new(out2, in, scaled, BUF_SIZE, sign, maxval, q34, rounding);
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Rémi Denis-Courmont June 4, 2024, 6:37 a.m. UTC | #3
Le 4 juin 2024 04:23:43 GMT+03:00, James Almer <jamrial@gmail.com> a écrit :
>quant_bands_signed_sse2: 417.0
>quant_bands_signed_avx: 202.0

What about unsigned?
James Almer June 4, 2024, 3:52 p.m. UTC | #4
On 6/4/2024 3:37 AM, Rémi Denis-Courmont wrote:
> 
> 
> Le 4 juin 2024 04:23:43 GMT+03:00, James Almer <jamrial@gmail.com> a écrit :
>> quant_bands_signed_sse2: 417.0
>> quant_bands_signed_avx: 202.0
> 
> What about unsigned?

Pretty much the same. Will add them before pushing.
diff mbox series

Patch

diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index d07960620e..ae15f91e06 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -242,7 +242,7 @@  typedef struct AACEncContext {
     enum RawDataBlockType cur_type;              ///< channel group type cur_channel belongs to
 
     AudioFrameQueue afq;
-    DECLARE_ALIGNED(16, int,   qcoefs)[96];      ///< quantized coefficients
+    DECLARE_ALIGNED(32, int,   qcoefs)[96];      ///< quantized coefficients
     DECLARE_ALIGNED(32, float, scoefs)[1024];    ///< scaled coefficients
 
     uint16_t quantize_band_cost_cache_generation;
diff --git a/libavcodec/x86/aacencdsp.asm b/libavcodec/x86/aacencdsp.asm
index 0d3ba4b89d..99be2d87f5 100644
--- a/libavcodec/x86/aacencdsp.asm
+++ b/libavcodec/x86/aacencdsp.asm
@@ -53,8 +53,19 @@  cglobal abs_pow34, 3, 3, 3, out, in, size
 ;                           int size, int is_signed, int maxval, const float Q34,
 ;                           const float rounding)
 ;*******************************************************************
-INIT_XMM sse2
+%macro AAC_QUANTIZE_BANDS 0
 cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
+%if mmsize == 32
+    vbroadcastss m0, Q34m
+    vbroadcastss m1, roundingm
+%if UNIX64 == 0
+    cvtsi2ss xm3, dword maxvalm
+%else
+    cvtsi2ss xm3, maxvald
+%endif
+    shufps   xm3, xm3, xm3, 0
+    vinsertf128 m3, m3, xm3, 1
+%else ; mmsize == 16
 %if UNIX64 == 0
     movss     m0, Q34m
     movss     m1, roundingm
@@ -65,9 +76,13 @@  cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q
     shufps    m0, m0, 0
     shufps    m1, m1, 0
     shufps    m3, m3, 0
+%endif
     shl       is_signedd, 31
-    movd      m4, is_signedd
-    shufps    m4, m4, 0
+    movd     xm4, is_signedd
+    shufps   xm4, xm4, xm4, 0
+%if mmsize == 32
+    vinsertf128 m4, m4, xm4, 1
+%endif
     shl       sized,   2
     add       inq, sizeq
     add       outq, sizeq
@@ -84,3 +99,9 @@  cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q
     add       sizeq, mmsize
     jl       .loop
     RET
+%endmacro
+
+INIT_XMM sse2
+AAC_QUANTIZE_BANDS
+INIT_YMM avx
+AAC_QUANTIZE_BANDS
diff --git a/libavcodec/x86/aacencdsp_init.c b/libavcodec/x86/aacencdsp_init.c
index e0d8dec4f8..cf17dbf91d 100644
--- a/libavcodec/x86/aacencdsp_init.c
+++ b/libavcodec/x86/aacencdsp_init.c
@@ -30,6 +30,9 @@  void ff_abs_pow34_sse(float *out, const float *in, const int size);
 void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
                                 int size, int is_signed, int maxval, const float Q34,
                                 const float rounding);
+void ff_aac_quantize_bands_avx(int *out, const float *in, const float *scaled,
+                               int size, int is_signed, int maxval, const float Q34,
+                               const float rounding);
 
 av_cold void ff_aacenc_dsp_init_x86(AACEncDSPContext *s)
 {
@@ -40,4 +43,7 @@  av_cold void ff_aacenc_dsp_init_x86(AACEncDSPContext *s)
 
     if (EXTERNAL_SSE2(cpu_flags))
         s->quant_bands = ff_aac_quantize_bands_sse2;
+
+    if (EXTERNAL_AVX_FAST(cpu_flags))
+        s->quant_bands = ff_aac_quantize_bands_avx;
 }
diff --git a/tests/checkasm/aacencdsp.c b/tests/checkasm/aacencdsp.c
index 791dd30320..5308a2ac03 100644
--- a/tests/checkasm/aacencdsp.c
+++ b/tests/checkasm/aacencdsp.c
@@ -81,8 +81,8 @@  static void test_quant_bands(AACEncDSPContext *s)
     for (int sign = 0; sign <= 1; sign++) {
         if (check_func(s->quant_bands, "quant_bands_%s",
                        sign ? "signed" : "unsigned")) {
-            LOCAL_ALIGNED_16(int, out, [BUF_SIZE]);
-            LOCAL_ALIGNED_16(int, out2, [BUF_SIZE]);
+            LOCAL_ALIGNED_32(int, out, [BUF_SIZE]);
+            LOCAL_ALIGNED_32(int, out2, [BUF_SIZE]);
 
             call_ref(out, in, scaled, BUF_SIZE, sign, maxval, q34, rounding);
             call_new(out2, in, scaled, BUF_SIZE, sign, maxval, q34, rounding);