diff mbox series

[FFmpeg-devel] avfilter/x86/af_afir: add FMA3 SIMD

Message ID CAPYw7P450pS5M+Bs=7sycA=dANabn6mBE_DxKDev9CvVd+dRDQ@mail.gmail.com
State New
Headers show
Series [FFmpeg-devel] avfilter/x86/af_afir: add FMA3 SIMD | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Paul B Mahol Sept. 10, 2023, 7:03 p.m. UTC
Attached.

Comments

Paul B Mahol Sept. 16, 2023, 12:22 p.m. UTC | #1
Will apply soon.
diff mbox series

Patch

From 7735a84fd0fdae731955f50bddba8dfef395713b Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Sun, 10 Sep 2023 19:25:20 +0200
Subject: [PATCH] avfilter/x86/af_afir: add FMA3 SIMD

Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/x86/af_afir.asm    | 27 +++++++++++++++++++++++++++
 libavfilter/x86/af_afir_init.c |  5 +++++
 2 files changed, 32 insertions(+)

diff --git a/libavfilter/x86/af_afir.asm b/libavfilter/x86/af_afir.asm
index 2cc09709a2..ed0276c7b9 100644
--- a/libavfilter/x86/af_afir.asm
+++ b/libavfilter/x86/af_afir.asm
@@ -67,3 +67,30 @@  INIT_XMM sse3
 FCMUL_ADD
 INIT_YMM avx
 FCMUL_ADD
+; fcmul_add (FMA3, YMM): per float pair (a0,a1) of t and (b0,b1) of c, sum += (a0*b0 - a1*b1, a1*b0 + a0*b1); then one scalar tail float.
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+cglobal fcmul_add, 4,4,4, sum, t, c, len
+    shl       lend, 3                       ; len *= 8: byte size of the paired data (2 floats = 8 bytes per pair)
+    add         tq, lenq                    ; move t, c and sum to the end of the paired data ...
+    add         cq, lenq
+    add       sumq, lenq
+    neg       lenq                          ; ... and index with a negative byte offset that counts up towards 0
+.loop:
+    movaps    m0, [tq + lenq]               ; m0 = t pairs (a0, a1); aligned load, so t must be vector-aligned
+    movaps    m1, [cq + lenq]               ; m1 = c pairs (b0, b1); aligned load, so c must be vector-aligned
+    vpermilps m3, m0, 177                   ; imm 0xB1: m3 = (a1, a0), halves of each pair swapped
+    vpermilps m2, m1, 160                   ; imm 0xA0: m2 = (b0, b0), even element duplicated
+    vpermilps m1, m1, 245                   ; imm 0xF5: m1 = (b1, b1), odd element duplicated
+    mulps     m1, m1, m3                    ; m1 = (a1*b1, a0*b1)
+    vfmaddsub132ps m0, m1, m2               ; m0 = m0*m2 -/+ m1 (sub on even, add on odd lanes) = (a0*b0 - a1*b1, a1*b0 + a0*b1)
+    addps     m0, m0, [sumq + lenq]         ; accumulate onto the values already stored in sum
+    movaps    [sumq + lenq], m0             ; aligned store back to sum
+    add       lenq, mmsize                  ; advance by one vector; the resulting flags drive the jl below
+    jl .loop                                ; keep going while the byte offset is still negative
+    movss xm0, [tq + lenq]                  ; scalar tail: one extra float right after the vector-processed data
+    mulss xm0, [cq + lenq]                  ; plain real multiply t*c; NOTE(review): presumably the trailing real-only bin, confirm against caller
+    addss xm0, [sumq + lenq]                ; add the existing sum value
+    movss [sumq + lenq], xm0                ; store the scalar result
+    RET
+%endif ; HAVE_FMA3_EXTERNAL
diff --git a/libavfilter/x86/af_afir_init.c b/libavfilter/x86/af_afir_init.c
index e53817b9c0..d573acf10b 100644
--- a/libavfilter/x86/af_afir_init.c
+++ b/libavfilter/x86/af_afir_init.c
@@ -26,6 +26,8 @@  void ff_fcmul_add_sse3(float *sum, const float *t, const float *c,
                        ptrdiff_t len);
 void ff_fcmul_add_avx(float *sum, const float *t, const float *c,
                       ptrdiff_t len);
+void ff_fcmul_add_fma3(float *sum, const float *t, const float *c,
+                       ptrdiff_t len); /* FMA3 variant; installed below when EXTERNAL_FMA3_FAST(cpu_flags) holds */
 
 av_cold void ff_afir_init_x86(AudioFIRDSPContext *s)
 {
@@ -37,4 +39,7 @@  av_cold void ff_afir_init_x86(AudioFIRDSPContext *s)
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         s->fcmul_add = ff_fcmul_add_avx;
     }
+    if (EXTERNAL_FMA3_FAST(cpu_flags)) {
+        s->fcmul_add = ff_fcmul_add_fma3;
+    }
 }
-- 
2.39.1