diff mbox

[FFmpeg-devel,2/2] x86/aacps: add ff_ps_stereo_interpolate_ipdopd_sse3()

Message ID 20170523190118.3524-2-jamrial@gmail.com
State Accepted
Commit b5a0971ff041badbdd1482e4ae2a0a16700a748f
Headers show

Commit Message

James Almer May 23, 2017, 7:01 p.m. UTC
About 2x faster than the C version.

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/aacpsdsp.asm    | 51 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/aacpsdsp_init.c |  4 ++++
 2 files changed, 55 insertions(+)

Comments

James Almer June 2, 2017, 2:10 p.m. UTC | #1
On 5/23/2017 4:01 PM, James Almer wrote:
> About 2x faster than the C version.
> 
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libavcodec/x86/aacpsdsp.asm    | 51 ++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/aacpsdsp_init.c |  4 ++++
>  2 files changed, 55 insertions(+)

Pushed.
diff mbox

Patch

diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index e92cbbce08..bb8a7f5df0 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -117,6 +117,57 @@  align 16
 .ret:
     REP_RET
 
+;***************************************************************************
+;void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+;                                          float h[2][4], float h_step[2][4],
+;                                          int len);
+;***************************************************************************
+INIT_XMM sse3
+cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
+    cmp      nd, 0                   ; len <= 0: leave l and r untouched
+    jle .ret
+    movaps   m0, [hq]                ; m0 = h[0][0..3] (aligned load), called h00..h03 below
+    movaps   m1, [hq+mmsize]         ; m1 = h[1][0..3] (aligned load), called h10..h13 below
+%if ARCH_X86_64
+    movaps   m8, [h_stepq]           ; x86-64: both h_step rows are kept in m8/m9;
+    movaps   m9, [h_stepq+mmsize]    ; otherwise H_STEP0/1 read them from memory
+    %define  H_STEP0 m8
+    %define  H_STEP1 m9
+%else
+    %define  H_STEP0 [h_stepq]
+    %define  H_STEP1 [h_stepq+mmsize]
+%endif
+    shl      nd, 3                   ; n = len * 8 = byte size of len float[2] items
+    add      lq, nq                  ; l and r now point one past the last item
+    add      rq, nq
+    neg      nq                      ; n = negative byte offset, counts up towards 0
+
+align 16
+.loop:
+    addps    m0, H_STEP0             ; step the register copy of row 0 (h is not written back)
+    addps    m1, H_STEP1             ; step the register copy of row 1
+    movddup  m2, [lq+nq]             ; m2 = l_re, l_im, l_re, l_im
+    movddup  m3, [rq+nq]             ; m3 = r_re, r_im, r_re, r_im
+    shufps   m4, m2, m2, q2301       ; m4 = l_im, l_re, l_im, l_re
+    shufps   m5, m3, m3, q2301       ; m5 = r_im, r_re, r_im, r_re
+    unpcklps m6, m0, m0              ; m6 = h00, h00, h01, h01
+    unpckhps m7, m0, m0              ; m7 = h02, h02, h03, h03
+    mulps    m2, m6                  ; m2 = h00*l_re, h00*l_im, h01*l_re, h01*l_im
+    mulps    m3, m7                  ; m3 = h02*r_re, h02*r_im, h03*r_re, h03*r_im
+    unpcklps m6, m1, m1              ; m6 = h10, h10, h11, h11
+    unpckhps m7, m1, m1              ; m7 = h12, h12, h13, h13
+    mulps    m4, m6                  ; m4 = h10*l_im, h10*l_re, h11*l_im, h11*l_re
+    mulps    m5, m7                  ; m5 = h12*r_im, h12*r_re, h13*r_im, h13*r_re
+    addps    m2, m3                  ; sum of the row-0 products for l and r
+    addsubps m2, m4                  ; even (re) lanes -= m4, odd (im) lanes += m4
+    addsubps m2, m5                  ; same for m5; never addsubps m4 with m5 first (sign flip in re)
+    movsd  [lq+nq], m2               ; low two floats  -> current l item (re, im)
+    movhps [rq+nq], m2               ; high two floats -> current r item (re, im)
+    add      nq, 8                   ; advance one float[2] item
+    jl .loop                         ; loop while the offset is still negative
+.ret:
+    REP_RET
+
 ;*******************************************************************
 ;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
 ;                                 const float (*filter)[8][2],
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
index f6d6c039c3..767ae6588e 100644
--- a/libavcodec/x86/aacpsdsp_init.c
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -37,6 +37,9 @@  void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
 void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
                                    float h[2][4], float h_step[2][4],
                                    int len);
+void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+                                          float h[2][4], float h_step[2][4],
+                                          int len);
 
 av_cold void ff_psdsp_init_x86(PSDSPContext *s)
 {
@@ -50,6 +53,7 @@  av_cold void ff_psdsp_init_x86(PSDSPContext *s)
     if (EXTERNAL_SSE3(cpu_flags)) {
         s->add_squares            = ff_ps_add_squares_sse3;
         s->stereo_interpolate[0]  = ff_ps_stereo_interpolate_sse3;
+        s->stereo_interpolate[1]  = ff_ps_stereo_interpolate_ipdopd_sse3;
         s->hybrid_analysis        = ff_ps_hybrid_analysis_sse3;
     }
 }