diff mbox series

[FFmpeg-devel] avcodec: flac x86 asm fix

Message ID CAPYw7P4HsGHG_ys1gmJiXUzqWjkKHM8k4nYdnX5k50NuU86QnA@mail.gmail.com
State New
Headers show
Series [FFmpeg-devel] avcodec: flac x86 asm fix | expand

Checks

Context Check Description
yinshiyou/configure_loongarch64 warning Failed to apply patch
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Paul B Mahol Sept. 3, 2022, 9:45 p.m. UTC
Patch attached.

Comments

James Almer Sept. 5, 2022, 11:19 a.m. UTC | #1
On 9/3/2022 6:45 PM, Paul B Mahol wrote:
> Patch attached.

[...]

> From 0f220489eb6402c6be3bc1d897c95fa9bc10431c Mon Sep 17 00:00:00 2001
> From: Paul B Mahol <onemda@gmail.com>
> Date: Sat, 3 Sep 2022 23:41:38 +0200
> Subject: [PATCH] avcodec/x86/flacdsp: fix bug in decorrelation
> 
> Fixes #9297
> 
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  libavcodec/x86/flacdsp.asm    | 23 +++++++++++++++-----
>  libavcodec/x86/flacdsp_init.c | 41 ++++++++++++++++++++++-------------
>  2 files changed, 44 insertions(+), 20 deletions(-)
> 
> diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
> index 7138611526..6d755f4972 100644
> --- a/libavcodec/x86/flacdsp.asm
> +++ b/libavcodec/x86/flacdsp.asm
> @@ -23,6 +23,10 @@
>  
>  %include "libavutil/x86/x86util.asm"
>  
> +SECTION_RODATA
> +
> +vector:  db 0,1,4,5,8,9,12,13,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,1,4,5,8,9,12,13,
> +
>  SECTION .text
>  
>  %macro PMACSDQL 5
> @@ -89,6 +93,9 @@ LPC_32 sse4
>  ;----------------------------------------------------------------------------------
>  %macro FLAC_DECORRELATE_16 3-4
>  cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
> +%ifidn %1, indep2
> +    VBROADCASTI128 m2, [vector]

You're loading 16 bytes, yet vector is 32 bytes. Are you sure this is 
ok? The upper 16 bytes look different, which makes me think you meant to 
use them.

> +%endif
>  %if ARCH_X86_32
>      mov      lend, lenm
>  %endif
> @@ -112,11 +119,17 @@ align 16
>  %endif
>  %ifnidn %1, indep2
>      p%4d       m2, m0, m1
> +    packssdw   m%2, m%2
> +    packssdw   m%3, m%3
> +    punpcklwd  m%2, m%3
> +    psllw      m%2, m3
> +%else
> +    pslld      m%2, m3
> +    pslld      m%3, m3
> +    pshufb     m%2, m%2, m2
> +    pshufb     m%3, m%3, m2
> +    punpcklwd  m%2, m%3
>  %endif
> -    packssdw  m%2, m%2
> -    packssdw  m%3, m%3
> -    punpcklwd m%2, m%3
> -    psllw     m%2, m3
>      mova [outq + lenq], m%2
>      add      lenq, 16
>      jl .loop
> @@ -292,7 +305,7 @@ align 16
>      REP_RET
>  %endmacro
>  
> -INIT_XMM sse2
> +INIT_XMM ssse3

Why are you turning all the unrelated indep functions into ssse3 when 
you only changed the FLAC_DECORRELATE_16 macro to fix the stereo 16bit 
indep path?

And did you try using FLAC_DECORRELATE_INDEP for it instead? I did 
mention below i purposely used a different macro for that specific 
scenario (probably to save some instructions), so the fix could have 
been using the proper indep macro.

>  FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
>  FLAC_DECORRELATE_INDEP 32, 2, 3, d
>  FLAC_DECORRELATE_INDEP 16, 4, 3, w
> diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
> index 2deaf3117f..48e3e7c55c 100644
> --- a/libavcodec/x86/flacdsp_init.c
> +++ b/libavcodec/x86/flacdsp_init.c
> @@ -34,7 +34,9 @@ void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int chann
>  void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
>                                            int len, int shift);                           \
>  void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
> -                                          int len, int shift);                           \
> +                                          int len, int shift);
> +
> +#define DECORRELATE_IFUNCS(fmt, opt)                                                     \
>  void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
>                                               int len, int shift);                        \
>  void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
> @@ -48,6 +50,10 @@ DECORRELATE_FUNCS(16, sse2);
>  DECORRELATE_FUNCS(16,  avx);
>  DECORRELATE_FUNCS(32, sse2);
>  DECORRELATE_FUNCS(32,  avx);
> +DECORRELATE_IFUNCS(16, ssse3);
> +DECORRELATE_IFUNCS(16,  avx);
> +DECORRELATE_IFUNCS(32, ssse3);
> +DECORRELATE_IFUNCS(32,  avx);
>  
>  av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels)
>  {
> @@ -55,30 +61,35 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
>      int cpu_flags = av_get_cpu_flags();
>  
>      if (EXTERNAL_SSE2(cpu_flags)) {
> +        if (fmt == AV_SAMPLE_FMT_S16) {
> +            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
> +            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
> +            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
> +        } else if (fmt == AV_SAMPLE_FMT_S32) {
> +            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
> +            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
> +            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
> +        }
> +    }
> +    if (EXTERNAL_SSSE3(cpu_flags)) {
>          if (fmt == AV_SAMPLE_FMT_S16) {
>              if (channels == 2)
> -                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
> +                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_ssse3;
>              else if (channels == 4)
> -                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
> +                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_ssse3;
>              else if (channels == 6)
> -                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
> +                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_ssse3;
>              else if (ARCH_X86_64 && channels == 8)
> -                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
> -            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
> -            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
> -            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
> +                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_ssse3;
>          } else if (fmt == AV_SAMPLE_FMT_S32) {
>              if (channels == 2)
> -                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
> +                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_ssse3;
>              else if (channels == 4)
> -                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
> +                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_ssse3;
>              else if (channels == 6)
> -                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
> +                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_ssse3;
>              else if (ARCH_X86_64 && channels == 8)
> -                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
> -            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
> -            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
> -            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
> +                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_ssse3;
>          }
>      }
>      if (EXTERNAL_SSE4(cpu_flags)) {
> -- 
> 2.37.2
>
diff mbox series

Patch

From 0f220489eb6402c6be3bc1d897c95fa9bc10431c Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Sat, 3 Sep 2022 23:41:38 +0200
Subject: [PATCH] avcodec/x86/flacdsp: fix bug in decorrelation

Fixes #9297

Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavcodec/x86/flacdsp.asm    | 23 +++++++++++++++-----
 libavcodec/x86/flacdsp_init.c | 41 ++++++++++++++++++++++-------------
 2 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
index 7138611526..6d755f4972 100644
--- a/libavcodec/x86/flacdsp.asm
+++ b/libavcodec/x86/flacdsp.asm
@@ -23,6 +23,10 @@ 
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+vector:  db 0,1,4,5,8,9,12,13,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,1,4,5,8,9,12,13,
+
 SECTION .text
 
 %macro PMACSDQL 5
@@ -89,6 +93,9 @@  LPC_32 sse4
 ;----------------------------------------------------------------------------------
 %macro FLAC_DECORRELATE_16 3-4
 cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
+%ifidn %1, indep2
+    VBROADCASTI128 m2, [vector]
+%endif
 %if ARCH_X86_32
     mov      lend, lenm
 %endif
@@ -112,11 +119,17 @@  align 16
 %endif
 %ifnidn %1, indep2
     p%4d       m2, m0, m1
+    packssdw   m%2, m%2
+    packssdw   m%3, m%3
+    punpcklwd  m%2, m%3
+    psllw      m%2, m3
+%else
+    pslld      m%2, m3
+    pslld      m%3, m3
+    pshufb     m%2, m%2, m2
+    pshufb     m%3, m%3, m2
+    punpcklwd  m%2, m%3
 %endif
-    packssdw  m%2, m%2
-    packssdw  m%3, m%3
-    punpcklwd m%2, m%3
-    psllw     m%2, m3
     mova [outq + lenq], m%2
     add      lenq, 16
     jl .loop
@@ -292,7 +305,7 @@  align 16
     REP_RET
 %endmacro
 
-INIT_XMM sse2
+INIT_XMM ssse3
 FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
 FLAC_DECORRELATE_INDEP 32, 2, 3, d
 FLAC_DECORRELATE_INDEP 16, 4, 3, w
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index 2deaf3117f..48e3e7c55c 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -34,7 +34,9 @@  void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int chann
 void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
                                           int len, int shift);                           \
 void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
-                                          int len, int shift);                           \
+                                          int len, int shift);
+
+#define DECORRELATE_IFUNCS(fmt, opt)                                                     \
 void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                              int len, int shift);                        \
 void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
@@ -48,6 +50,10 @@  DECORRELATE_FUNCS(16, sse2);
 DECORRELATE_FUNCS(16,  avx);
 DECORRELATE_FUNCS(32, sse2);
 DECORRELATE_FUNCS(32,  avx);
+DECORRELATE_IFUNCS(16, ssse3);
+DECORRELATE_IFUNCS(16,  avx);
+DECORRELATE_IFUNCS(32, ssse3);
+DECORRELATE_IFUNCS(32,  avx);
 
 av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels)
 {
@@ -55,30 +61,35 @@  av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_SSE2(cpu_flags)) {
+        if (fmt == AV_SAMPLE_FMT_S16) {
+            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
+        } else if (fmt == AV_SAMPLE_FMT_S32) {
+            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
+        }
+    }
+    if (EXTERNAL_SSSE3(cpu_flags)) {
         if (fmt == AV_SAMPLE_FMT_S16) {
             if (channels == 2)
-                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_ssse3;
             else if (channels == 4)
-                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_ssse3;
             else if (channels == 6)
-                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_ssse3;
             else if (ARCH_X86_64 && channels == 8)
-                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
-            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
-            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
-            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_ssse3;
         } else if (fmt == AV_SAMPLE_FMT_S32) {
             if (channels == 2)
-                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_ssse3;
             else if (channels == 4)
-                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_ssse3;
             else if (channels == 6)
-                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_ssse3;
             else if (ARCH_X86_64 && channels == 8)
-                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
-            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
-            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
-            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_ssse3;
         }
     }
     if (EXTERNAL_SSE4(cpu_flags)) {
-- 
2.37.2