diff mbox

[FFmpeg-devel] swresample/x86/resample: extend resample_double to support avx and fma3

Message ID 20170316043748.21058-1-mfcc64@gmail.com
State Accepted
Commit de1308429ae649c899b74365f0dc72847676ba75
Headers show

Commit Message

Muhammad Faiz March 16, 2017, 4:37 a.m. UTC
benchmark:
sse2 10.670s
avx   8.763s
fma3  8.380s

Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>
---
 libswresample/x86/resample.asm    | 15 ++++++++++++---
 libswresample/x86/resample_init.c | 10 ++++++++++
 2 files changed, 22 insertions(+), 3 deletions(-)

Comments

Muhammad Faiz March 19, 2017, 5:48 a.m. UTC | #1
On Thu, Mar 16, 2017 at 11:37 AM, Muhammad Faiz <mfcc64@gmail.com> wrote:
> benchmark:
> sse2 10.670s
> avx   8.763s
> fma3  8.380s
>
> Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>
> ---
>  libswresample/x86/resample.asm    | 15 ++++++++++++---
>  libswresample/x86/resample_init.c | 10 ++++++++++
>  2 files changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
> index 4163df1..7107cf9 100644
> --- a/libswresample/x86/resample.asm
> +++ b/libswresample/x86/resample.asm
> @@ -203,7 +203,7 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_count, dst, frac, \
>      ; horizontal sum & store
>  %if mmsize == 32
>      vextractf128                 xm1, m0, 0x1
> -    addps                        xm0, xm1
> +    addp%4                       xm0, xm1
>  %endif
>      movhlps                      xm1, xm0
>  %ifidn %1, float
> @@ -489,8 +489,8 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
>  %if mmsize == 32
>      vextractf128                 xm1, m0, 0x1
>      vextractf128                 xm3, m2, 0x1
> -    addps                        xm0, xm1
> -    addps                        xm2, xm3
> +    addp%4                       xm0, xm1
> +    addp%4                       xm2, xm3
>  %endif
>      cvtsi2s%4                    xm1, fracd
>      subp%4                       xm2, xm0
> @@ -608,3 +608,12 @@ RESAMPLE_FNS int16, 2, 1
>
>  INIT_XMM sse2
>  RESAMPLE_FNS double, 8, 3, d, pdbl_1
> +
> +%if HAVE_AVX_EXTERNAL
> +INIT_YMM avx
> +RESAMPLE_FNS double, 8, 3, d, pdbl_1
> +%endif
> +%if HAVE_FMA3_EXTERNAL
> +INIT_YMM fma3
> +RESAMPLE_FNS double, 8, 3, d, pdbl_1
> +%endif
> diff --git a/libswresample/x86/resample_init.c b/libswresample/x86/resample_init.c
> index e515762..c6b2a36 100644
> --- a/libswresample/x86/resample_init.c
> +++ b/libswresample/x86/resample_init.c
> @@ -42,6 +42,8 @@ RESAMPLE_FUNCS(float,  avx);
>  RESAMPLE_FUNCS(float,  fma3);
>  RESAMPLE_FUNCS(float,  fma4);
>  RESAMPLE_FUNCS(double, sse2);
> +RESAMPLE_FUNCS(double, avx);
> +RESAMPLE_FUNCS(double, fma3);
>
>  av_cold void swri_resample_dsp_x86_init(ResampleContext *c)
>  {
> @@ -85,6 +87,14 @@ av_cold void swri_resample_dsp_x86_init(ResampleContext *c)
>              c->dsp.resample_linear = ff_resample_linear_double_sse2;
>              c->dsp.resample_common = ff_resample_common_double_sse2;
>          }
> +        if (EXTERNAL_AVX_FAST(mm_flags)) {
> +            c->dsp.resample_linear = ff_resample_linear_double_avx;
> +            c->dsp.resample_common = ff_resample_common_double_avx;
> +        }
> +        if (EXTERNAL_FMA3_FAST(mm_flags)) {
> +            c->dsp.resample_linear = ff_resample_linear_double_fma3;
> +            c->dsp.resample_common = ff_resample_common_double_fma3;
> +        }
>          break;
>      }
>  }
> --
> 2.9.3
>

Applied

Thanks
diff mbox

Patch

diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
index 4163df1..7107cf9 100644
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -203,7 +203,7 @@  cglobal resample_common_%1, 1, 7, 2, ctx, phase_count, dst, frac, \
     ; horizontal sum & store
 %if mmsize == 32
     vextractf128                 xm1, m0, 0x1
-    addps                        xm0, xm1
+    addp%4                       xm0, xm1
 %endif
     movhlps                      xm1, xm0
 %ifidn %1, float
@@ -489,8 +489,8 @@  cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
 %if mmsize == 32
     vextractf128                 xm1, m0, 0x1
     vextractf128                 xm3, m2, 0x1
-    addps                        xm0, xm1
-    addps                        xm2, xm3
+    addp%4                       xm0, xm1
+    addp%4                       xm2, xm3
 %endif
     cvtsi2s%4                    xm1, fracd
     subp%4                       xm2, xm0
@@ -608,3 +608,12 @@  RESAMPLE_FNS int16, 2, 1
 
 INIT_XMM sse2
 RESAMPLE_FNS double, 8, 3, d, pdbl_1
+
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+RESAMPLE_FNS double, 8, 3, d, pdbl_1
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+RESAMPLE_FNS double, 8, 3, d, pdbl_1
+%endif
diff --git a/libswresample/x86/resample_init.c b/libswresample/x86/resample_init.c
index e515762..c6b2a36 100644
--- a/libswresample/x86/resample_init.c
+++ b/libswresample/x86/resample_init.c
@@ -42,6 +42,8 @@  RESAMPLE_FUNCS(float,  avx);
 RESAMPLE_FUNCS(float,  fma3);
 RESAMPLE_FUNCS(float,  fma4);
 RESAMPLE_FUNCS(double, sse2);
+RESAMPLE_FUNCS(double, avx);
+RESAMPLE_FUNCS(double, fma3);
 
 av_cold void swri_resample_dsp_x86_init(ResampleContext *c)
 {
@@ -85,6 +87,14 @@  av_cold void swri_resample_dsp_x86_init(ResampleContext *c)
             c->dsp.resample_linear = ff_resample_linear_double_sse2;
             c->dsp.resample_common = ff_resample_common_double_sse2;
         }
+        if (EXTERNAL_AVX_FAST(mm_flags)) {
+            c->dsp.resample_linear = ff_resample_linear_double_avx;
+            c->dsp.resample_common = ff_resample_common_double_avx;
+        }
+        if (EXTERNAL_FMA3_FAST(mm_flags)) {
+            c->dsp.resample_linear = ff_resample_linear_double_fma3;
+            c->dsp.resample_common = ff_resample_common_double_fma3;
+        }
         break;
     }
 }