diff mbox series

[FFmpeg-devel,2/3] x86/ac3dsp: add ff_float_to_fixed24_avx2()

Message ID 20231122194913.9856-2-jamrial@gmail.com
State New
Headers show
Series [FFmpeg-devel,1/3] x86/ac3dsp: reduce instruction count inside the float_to_fixed24 loop | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

James Almer Nov. 22, 2023, 7:49 p.m. UTC
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/ac3dsp.h          |  4 ++--
 libavcodec/ac3enc_template.c |  2 +-
 libavcodec/x86/ac3dsp.asm    | 28 ++++++++++++++++++++++++++--
 libavcodec/x86/ac3dsp_init.c |  4 ++++
 4 files changed, 33 insertions(+), 5 deletions(-)

Comments

Kieran Kunhya Nov. 23, 2023, 6:56 a.m. UTC | #1
On Wed, 22 Nov 2023, 19:49 James Almer, <jamrial@gmail.com> wrote:

> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libavcodec/ac3dsp.h          |  4 ++--
>  libavcodec/ac3enc_template.c |  2 +-
>  libavcodec/x86/ac3dsp.asm    | 28 ++++++++++++++++++++++++++--
>  libavcodec/x86/ac3dsp_init.c |  4 ++++
>  4 files changed, 33 insertions(+), 5 deletions(-)
>

I think this is AVX, I might be wrong but I don't see any AVX2.

Kieran

>
James Almer Nov. 23, 2023, 11:51 a.m. UTC | #2
On 11/23/2023 3:56 AM, Kieran Kunhya wrote:
> On Wed, 22 Nov 2023, 19:49 James Almer, <jamrial@gmail.com> wrote:
> 
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>>   libavcodec/ac3dsp.h          |  4 ++--
>>   libavcodec/ac3enc_template.c |  2 +-
>>   libavcodec/x86/ac3dsp.asm    | 28 ++++++++++++++++++++++++++--
>>   libavcodec/x86/ac3dsp_init.c |  4 ++++
>>   4 files changed, 33 insertions(+), 5 deletions(-)
>>
> 
> I think this is AVX, I might be wrong but I don't see any AVX2.
> 
> Kieran

movdqa with ymm is AVX2. I could change it to movaps, but technically 
the registers contain floats and I don't know if any old AVX CPU has 
penalties for changing domains.
Henrik Gramner Nov. 23, 2023, 3:19 p.m. UTC | #3
On Thu, Nov 23, 2023 at 12:51 PM James Almer <jamrial@gmail.com> wrote:
> movdqa with ymm is AVX2. I could change it to movaps, but technically
> the registers contain floats and I don't know if any old AVX CPU has
> penalties for changing domains.

FWIW, I believe the domain that the result of fp <-> int conversion
instructions belongs to actually differs between µarchs. Realistically,
though, whether movaps or movdqa is used to store the result to memory
is unlikely to matter in practice.
diff mbox series

Patch

diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index a01bff3d11..25341f3396 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -47,9 +47,9 @@  typedef struct AC3DSPContext {
      * [-(1<<24),(1<<24)]
      *
      * @param dst destination array of int32_t.
-     *            constraints: 16-byte aligned
+     *            constraints: 32-byte aligned
      * @param src source array of float.
-     *            constraints: 16-byte aligned
+     *            constraints: 32-byte aligned
      * @param len number of elements to convert.
      *            constraints: multiple of 32 greater than zero
      */
diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c
index be4ecebc9c..a16faea681 100644
--- a/libavcodec/ac3enc_template.c
+++ b/libavcodec/ac3enc_template.c
@@ -112,7 +112,7 @@  static void apply_channel_coupling(AC3EncodeContext *s)
 {
     LOCAL_ALIGNED_16(CoefType, cpl_coords,      [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
 #if AC3ENC_FLOAT
-    LOCAL_ALIGNED_16(int32_t, fixed_cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
+    LOCAL_ALIGNED_32(int32_t, fixed_cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
 #else
     int32_t (*fixed_cpl_coords)[AC3_MAX_CHANNELS][16] = cpl_coords;
 #endif
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 42c8310462..e31c58e1c1 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -21,10 +21,10 @@ 
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
 ; 16777216.0f - used in ff_float_to_fixed24()
-pf_1_24: times 4 dd 0x4B800000
+pf_1_24: times 8 dd 0x4B800000
 
 ; used in ff_ac3_compute_mantissa_size()
 cextern ac3_bap_bits
@@ -128,6 +128,30 @@  cglobal float_to_fixed24, 3, 3, 9, dst, src, len
     jl .loop
     RET
 
+INIT_YMM avx2
+cglobal float_to_fixed24, 3, 3, 5, dst, src, len
+    movaps     m0, [pf_1_24]
+    shl      lenq, 2
+    add      srcq, lenq
+    add      dstq, lenq
+    neg      lenq
+.loop:
+    mulps      m1, m0, [srcq+lenq+mmsize*0]
+    mulps      m2, m0, [srcq+lenq+mmsize*1]
+    mulps      m3, m0, [srcq+lenq+mmsize*2]
+    mulps      m4, m0, [srcq+lenq+mmsize*3]
+    cvtps2dq   m1, m1
+    cvtps2dq   m2, m2
+    cvtps2dq   m3, m3
+    cvtps2dq   m4, m4
+    movdqa  [dstq+lenq+mmsize*0], m1
+    movdqa  [dstq+lenq+mmsize*1], m2
+    movdqa  [dstq+lenq+mmsize*2], m3
+    movdqa  [dstq+lenq+mmsize*3], m4
+    add      lenq, mmsize*4
+    jl .loop
+    RET
+
 ;------------------------------------------------------------------------------
 ; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
 ;------------------------------------------------------------------------------
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index 43b3b4ac85..106121b5b9 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -27,6 +27,7 @@ 
 void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 
 void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
+void ff_float_to_fixed24_avx2 (int32_t *dst, const float *src, unsigned int len);
 
 int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
 
@@ -48,6 +49,9 @@  av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
         if (!(cpu_flags & AV_CPU_FLAG_ATOM))
             c->extract_exponents = ff_ac3_extract_exponents_ssse3;
     }
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->float_to_fixed24 = ff_float_to_fixed24_avx2;
+    }
 }
 
 #define DOWNMIX_FUNC_OPT(ch, opt)                                       \